-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNodeData.py
More file actions
297 lines (239 loc) · 9.75 KB
/
NodeData.py
File metadata and controls
297 lines (239 loc) · 9.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
import requests
import demjson3 as demjson
import re
import ast
from bs4 import BeautifulSoup
from NodeClasses import *
import itertools
from operator import attrgetter
from enum import Enum, auto
class WowheadDatasource(Enum):
    """URL path prefix that selects which Wowhead site variant is scraped."""

    LIVE = ""      # live retail site — no extra path segment
    PTR = "ptr/"   # public test realm variant
    BETA = "beta/" # beta variant
# Which Wowhead variant to scrape; used by make_soup_instance to build URLs.
current_source = WowheadDatasource.LIVE

# node_id -> {"name": str, "processed": bool}; gathering nodes discovered so far.
node_db = {}
# item_id -> None; items confirmed as fishable (value is only a presence marker).
fish_db = {}
# map_id -> Map; pre-seeded with the current expansion's zones (older zones kept
# commented out for reference) and extended on demand during parsing.
map_db = {
    # 11510:Map("Ardenweald"),
    # 10534:Map("Bastion"),
    # 11462:Map("Maldraxxus"),
    # 10413:Map("Revendreth"),
    # 11400:Map("The Maw"),
    # 13644: Map("The Waking Shores"),
    # 13646: Map("The Azure Span"),
    # 13645: Map("Ohn'ahran Plains"),
    # 13647: Map("Thaldraszus"),
    # 13862: Map("Valdrakken"),
    # 14022: Map("Zaralek Cavern"),
    14717: Map("Isle of Dorn"),
    14795: Map("The Ringing Deeps"),
    14838: Map("Hallowfall"),
    14752: Map("Azj-Kahet"),
}

# Wowhead "classs" value the scraper cares about ("classs" is Wowhead's own
# field spelling). NOTE(review): presumably the Trade Goods item class — confirm.
classs_to_find = 7
# ListView search terms keyed by gather style; "term" is the marker string
# located in the page's embedded JS to find the relevant data block.
term_info_MINE = dict(name="Metal & Stone", term="WH.TERMS.minedfrom,")
term_info_FISH = dict(name="Cooking", term="WH.TERMS.fishedin,")
term_info_HERB = dict(name="Herb", term="WH.TERMS.gatheredfrom,")
# Wowhead subclass id -> term-info dict above (7=mined, 8=fished, 9=herb here).
subclass_object_lookup = {7: term_info_MINE, 8: term_info_FISH, 9: term_info_HERB}
def reset_nodes():
    """Empty the node and fish caches and clear every known map in place.

    The map_db dictionary itself is intentionally preserved so the set of
    registered maps survives a reset.
    """
    global map_db, node_db, fish_db
    for known_map in map_db.values():
        known_map.clear()
    node_db.clear()
    fish_db.clear()
def make_soup_instance(type_name, obj_id, timeout=30) -> BeautifulSoup:
    """Fetch a Wowhead page and return it parsed as BeautifulSoup.

    Builds the URL from the active datasource prefix, e.g.
    ``https://www.wowhead.com/{prefix}{type_name}={obj_id}``.

    :param type_name: Wowhead entity kind used in the URL ("object", "zone", "item").
    :param obj_id: numeric Wowhead id for that entity.
    :param timeout: seconds before the HTTP request is abandoned (fix: the
        original call had no timeout, so a stalled server hung forever).
    :returns: BeautifulSoup over the response body (html.parser backend).
    """
    url = f"https://www.wowhead.com/{current_source.value}{type_name}={obj_id}"
    # Browser-like headers; Wowhead rejects default client user agents.
    hdr = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    page = requests.get(url, headers=hdr, timeout=timeout)
    return BeautifulSoup(page.text, "html.parser")
def scrape_short_title_name(type_name, obj_id, soup=None):
    """Return the leading segment of the page <title> (text before the first '-').

    :param type_name: Wowhead entity kind, forwarded to make_soup_instance.
    :param obj_id: numeric Wowhead id.
    :param soup: optional pre-fetched BeautifulSoup; fetched when omitted.
    :raises RuntimeError: when the page has no usable <title> tag.
    """
    if soup is None:
        soup = make_soup_instance(type_name, obj_id)
    title_tag = soup.find("title")
    if not title_tag:
        raise RuntimeError(f"Could not find title for {type_name} with id {obj_id}")
    raw_title = title_tag.contents[0]
    short_name, _, _ = raw_title.partition("-")
    return short_name.strip()
def scrape_subclass_object(item_id, soup, lookup_type="id"):
    """Locate the item-info JS blob for *item_id* in *soup* and translate its
    Wowhead subclass into one of the term-info dicts.

    :param item_id: numeric Wowhead id to look up.
    :param soup: BeautifulSoup of the object/item page.
    :param lookup_type: JSON key to anchor the search on (default "id").
    :returns: the matching term-info dict from subclass_object_lookup, or
        None when the blob is missing or the subclass is not tracked.
    """
    item_info_lookup = f'"{lookup_type}":{item_id}'
    # Raw string (fix: avoids invalid-escape warnings on modern Python).
    subclass_regex = re.compile(r'"subclass":([0-9]+),')

    # Find the script text node containing the item-info line.
    item_info_js = soup.find(text=re.compile(item_info_lookup))
    if item_info_js is None:
        print(f"Could not find item_info for item_id {item_id}")
        return None

    # The block holds several JS statements; pick the one for our item.
    # NOTE(review): the original also defined a "classs" regex but the
    # extraction was commented out, so classs is always None; preserved.
    classs, subclass = None, None
    for line in item_info_js.split(";"):
        if item_info_lookup in line:
            match = subclass_regex.search(line)
            # Fix: the original called .group(1) unguarded and crashed with
            # AttributeError when the line carried no "subclass" field.
            if match is not None:
                subclass = int(match.group(1))
            break

    if subclass not in subclass_object_lookup:
        print(
            f"Item_id {item_id} does not have correct class/subclass. Classs=[{classs}]; Subclass=[{subclass}]"
        )
        return None
    return subclass_object_lookup[subclass]
def scrape_gathered_data_from_term(soup, term):
    """Extract the ListView ``data: [...]`` array that follows *term* in the
    page's embedded JS.

    :param soup: BeautifulSoup of a Wowhead page.
    :param term: marker string (e.g. "WH.TERMS.minedfrom,") identifying the
        ListView block to pull.
    :returns: list of dicts parsed from the data array, or None when the
        term does not appear on the page.
    """
    content = soup.find(text=re.compile(term))
    if content is None:
        return None
    # Raw string (fix: r"\[" avoids the invalid-escape SyntaxWarning); the
    # pattern is compiled once and reused across lines.
    data_regex = re.compile(r"data: (\[.*\])")
    # The script block holds several JS statements; keep only the ones that
    # actually mention the term.
    object_info = []
    for view in content.split(";"):
        if term in view:
            match = data_regex.search(view)
            # Fix: the original called .group(1) unguarded and crashed when a
            # matching line lacked a data array.
            if match is None:
                continue
            object_info.extend(ast.literal_eval(match.group(1)))
    return object_info
def parse_fished_from_gathered_data(gathered_data, item_id):
    """Mark *item_id* as fishable and register every map it appears in.

    Each entry's "id" field is a map id; maps not yet in map_db are created
    by scraping their zone title. Map id -1 is reported and skipped.
    """
    global fish_db, map_db
    for entry in gathered_data:
        map_id = entry["id"]
        # Presence marker only — the stored value carries no data.
        fish_db.setdefault(item_id, None)
        if map_id == -1:
            print(
                f"ERROR: item {item_id} is mapped to an invalid map id {map_id}. Skipping this map."
            )
            continue
        if map_id not in map_db:
            map_db[map_id] = Map(scrape_short_title_name("zone", map_id))
def parse_nodes_from_gathered_data(gathered_data):
    """Record each gathering node in node_db and register the maps it spans.

    New nodes enter node_db unprocessed; maps not yet in map_db are created
    by scraping their zone title. Map id -1 is reported and skipped.
    """
    # TODO: Return data from this function and cache data based on the term used to find the data.
    global node_db, map_db
    for entry in gathered_data:
        node_id, node_name = entry["id"], entry["name"]
        if node_id not in node_db:
            node_db[node_id] = {"name": node_name, "processed": False}
        for map_id in entry["location"]:
            if map_id == -1:
                print(
                    f"ERROR: {node_id} '{node_name}' is mapped to an invalid map id {map_id}. Skipping this map."
                )
                continue
            if map_id not in map_db:
                map_db[map_id] = Map(scrape_short_title_name("zone", map_id))
def scrape_map_nodes_from_item(item_id):
    """Scrape one Wowhead object page and route its gather data into the DBs.

    Fished items go to fish_db via parse_fished_from_gathered_data; mined and
    herb nodes go to node_db via parse_nodes_from_gathered_data. Items with an
    unrecognized subclass or no ListView data are skipped.
    """
    soup = make_soup_instance("object", item_id)
    subclass_object = scrape_subclass_object(item_id, soup)
    if subclass_object is None:
        return
    gathered_data = scrape_gathered_data_from_term(soup, subclass_object["term"])
    if gathered_data is None:
        print(f"No valid nodes found for {item_id}")
        return
    is_fished = subclass_object["term"] == term_info_FISH["term"]
    if is_fished:
        parse_fished_from_gathered_data(gathered_data, item_id)
    else:
        parse_nodes_from_gathered_data(gathered_data)
def scape_item_ids(item_names):
    """Yield ``(object_id, object_name)`` for every zone object whose name is
    in *item_names*, scanning every map currently registered in map_db.

    NOTE(review): the name looks like a typo for "scrape_item_ids"; kept
    unchanged for caller compatibility.

    :param item_names: container of object names to match (membership test).
    :yields: (id, name) tuples parsed from each zone's objects ListView.
    """
    term = "WH.TERMS.objects"
    # Raw string (fix) and hoisted out of the per-map loop.
    data_regex = re.compile(r"data: (\[.*\])")
    for map_id in map_db.keys():
        print(map_id)
        soup = make_soup_instance("zone", map_id)
        content = soup.find(text=re.compile(term))
        # Fix: the original dereferenced content unguarded and crashed with
        # AttributeError when the term was absent from the page.
        if content is None:
            print(f"Could not find object list for map {map_id}")
            continue
        for list_view in content.split(";\n"):
            if term in list_view:
                match = data_regex.search(list_view)
                if match is None:
                    continue
                for obj in ast.literal_eval(match.group(1)):
                    if obj["name"] in item_names:
                        yield obj["id"], obj["name"]
# def scrape_map_nodes_from_item(item_id):
# global map_db, node_db
# soup = make_soup_instance("item", item_id)
# content = soup.find(text=re.compile("-from-object"))
# if content is None:
# print(f"Could not find content for {item_id}")
# return
#
# list_views = content.split(';')
# object_info = []
# for view in list_views:
# if '-from-object' in view:
# object_info.extend(ast.literal_eval(re.search('data: (\\[.*\\])', view).group(1)))
# for node in object_info:
# node_id = node['id']
# node_name = node['name']
# node_maps = node['location']
#
# if node_id not in node_db:
# node_db[node_id] = node_name
#
# for map_id in node_maps:
# if map_id == -1:
# print(f"ERROR: {node_id} '{node_name}' is mapped to an invalid map id {map_id}. Skipping this map.")
# continue
# if map_id not in map_db:
# map_db[map_id] = Map(scrape_short_title_name("zone", map_id))
def construct_node_lists_from_wowhead_data():
    """Drive the full parse: scrape coordinates for every unprocessed node and
    every fished item, then walk each map's node set.

    Nodes are flagged processed after parsing so reruns skip them. The per-map
    summary string is still assembled (grouped by node id, sorted by id/x/y)
    although its print statement is currently commented out.
    """
    for node_id, node_info in node_db.items():
        print(f"Parsing node: ({node_id}: {node_info})")
        if node_info["processed"]:
            continue
        parse_wowhead_data(node_id)
        node_db[node_id]["processed"] = True

    for item_id in fish_db:
        # print(f"Parsing item: {item_id}")
        parse_wowhead_data(item_id, node_type="item")

    for map_uid in map_db:
        print(f"Parsing map: {map_uid}")
        ordered = sorted(map_db[map_uid].node_set, key=attrgetter("id", "x", "y"))
        grouped = itertools.groupby(ordered, attrgetter("id"))
        output = "\n\t".join(f"{nid}: {list(locs)}" for nid, locs in grouped)
        # print(f"Map: {map_db[map_uid].name}\n\t{output}")
def parse_wowhead_data(node_id, node_type="object"):
    """Scrape ``g_mapperData`` from a Wowhead page and add every coordinate
    pair to the owning Map in map_db.

    :param node_id: Wowhead id of the page entity; also stamped into every
        MapNodeLocation created here.
    :param node_type: URL entity kind ("object" for nodes, "item" for fish).
    """
    soup = make_soup_instance(node_type, node_id)

    # Collect every <script> text chunk that defines g_mapperData.
    scripts = []
    for tag in soup.find_all("script"):
        for content in tag.contents:
            if "g_mapperData" in content:
                scripts.append(content)

    for script in scripts:
        # Slice out the JS object literal assigned to g_mapperData and
        # squeeze whitespace so it forms one compact token stream.
        script = str(script)
        mapperdata = script.split("g_mapperData =")[1]
        mapperdata = mapperdata.split(";")[0]
        mapperdata = mapperdata.replace("\n", "")
        js_obj = mapperdata.replace(" ", "")
        # demjson tolerates the non-strict JS literal (e.g. unquoted keys)
        # that json.loads would reject.
        wowhead_data = demjson.decode(js_obj)

        for map_uid in wowhead_data:
            data = wowhead_data[map_uid][0].get("coords")
            # Fix: .get() returns None when "coords" is absent, and the old
            # `if not len(data)` then raised TypeError; a plain truthiness
            # test skips both the missing and the empty case.
            if not data:
                continue
            # Reuse the registered Map, or create a placeholder to resolve later.
            curr_map = map_db.get(int(map_uid), Map("lookup later(id is %s" % map_uid))
            for coord in data:
                local_node_loc = MapNodeLocation(node_id, coord[0], coord[1])
                curr_map.add_node(local_node_loc)
            map_db[int(map_uid)] = curr_map