1+
2+
3+ import os
4+ import glob
5+ from typing import Union
6+ import re
7+ import json
8+ import logging
9+
10+ from scrapy .spiders import Spider
11+ from scrapy .http import HtmlResponse
12+ from scrapy .crawler import CrawlerProcess
13+ from scrapy .utils .project import get_project_settings
14+
15+ from playwright .async_api import async_playwright
16+
17+ from maple_structures import Article , Author
18+
# Module-level logger shared by the parser and spider classes below.
logger = logging .getLogger ("CTVNewsSpider" )
20+
class CTVNewsParser:
    """Extracts ``Article`` objects from rendered CTV News article pages."""

    @staticmethod
    def parse_article(response: HtmlResponse) -> Union[Article, None]:
        """Parse an article page, preferring the embedded Fusion JSON payload.

        Falls back to HTML parsing when JSON extraction fails.

        Args:
            response: the rendered article page.

        Returns:
            The parsed ``Article``, or ``None`` when both strategies fail.
        """
        article = None
        try:
            article = CTVNewsParser.parse_from_json(response)
        except ValueError as e:
            # ValueError covers both json.JSONDecodeError (its subclass) and
            # the "Article not found in JSON" error raised by parse_from_json;
            # previously only JSONDecodeError was caught, so the latter
            # escaped this wrapper uncaught.
            logger.warning('Error parsing article from JSON: %s', e)

        if article is None:
            try:
                article = CTVNewsParser.parse_from_html(response)
            except NotImplementedError as e:
                logger.warning('HTML parsing not implemented: %s', e)
            except Exception as e:
                logger.warning('Error parsing article from HTML: %s', e)

        return article

    @staticmethod
    def parse_from_json(response: HtmlResponse) -> Union[Article, None]:
        """Build an ``Article`` from the page's ``Fusion.globalContent`` blob.

        CTV News pages embed their content as JSON inside the
        ``fusion-metadata`` script tag; this extracts and maps it.

        Raises:
            ValueError: when the script tag or the JSON assignment is absent
                (also raised, as ``json.JSONDecodeError``, on invalid JSON).
        """
        js = response.xpath('//script[@id="fusion-metadata"]//text()').extract_first()

        if js:
            # NOTE(review): the non-greedy match stops at the first "};",
            # which assumes the JSON contains no such sequence inside a
            # string value — confirm against real pages.
            match = re.search(r'Fusion\.globalContent\s*=\s*({.*?});', js, re.DOTALL)
            if match:
                data = json.loads(match.group(1))
                article = Article()
                article.url = response.url

                # Title
                if 'basic' in data.get('headlines', {}):
                    article.title = data['headlines']['basic']

                # Content: title followed by each text/header element.
                if isinstance(data.get('content_elements'), list):
                    content = article.title
                    for element in data['content_elements']:
                        element_type = element.get('type')
                        if 'content' not in element:
                            continue
                        if element_type == 'text':
                            content += '\n ' + element['content']
                        elif element_type == 'header':
                            content += '\n ' + element['content'] + '\n '
                        else:
                            content += ' ' + element['content']
                    article.content = content

                # Dates: prefer the first publish date when present.
                if 'first_publish_date' in data:
                    article.date_published = data['first_publish_date']
                elif 'publish_date' in data:
                    article.date_published = data['publish_date']
                if 'last_updated_date' in data:
                    article.date_modified = data['last_updated_date']

                if 'language' in data:
                    article.language = data['language']

                # Authors
                for author in data.get('credits', {}).get('by', []):
                    new_author = Author()
                    original = author.get('additional_properties', {}).get('original')
                    if original is not None:
                        name = ''
                        if 'firstName' in original:
                            name += original['firstName']
                        if 'lastName' in original:
                            name += ' ' + original['lastName']
                        new_author.name = name

                        new_author.email = original.get('email')
                        if 'bio_page' in original:
                            new_author.url = response.urljoin(original['bio_page'])
                    # If the best format is not found, try other fields.
                    elif author.get('type') == 'author':
                        if 'name' in author:
                            new_author.name = author['name']
                        if 'url' in author:
                            new_author.url = response.urljoin(author['url'])
                        social = author.get('social_links', {})
                        if 'url' in social and social.get('site') == 'email':
                            new_author.email = social['url']
                    if new_author.name != '':
                        article.add_author(new_author)
                return article
        raise ValueError('Article not found in JSON')

    @staticmethod
    def parse_from_html(response: HtmlResponse) -> Union[Article, None]:
        """Fallback HTML parser; not implemented yet."""
        raise NotImplementedError()
123+
124+
class CTVNewsSpider(Spider):
    """Crawls ctvnews.ca, rendering article pages with headless Chromium."""

    name = 'CTVNewsSpider'
    # Directory for debug artifacts (screenshot, article JSON, raw HTML).
    # Set on the class before crawling; None disables artifact capture.
    sample_file_location = None

    def __init__(self, on_article_content: callable = None, **kwargs):
        """Create the spider.

        Args:
            on_article_content: optional callback for parsed articles.
                NOTE(review): currently stored but never invoked — confirm
                intended wiring.
        """
        super().__init__(self.name, **kwargs)
        self.start_urls = ['https://www.ctvnews.ca']
        self.on_article_content = on_article_content
        self.visited = []
        self.count = 0
        if self.sample_file_location is not None:
            os.makedirs(self.sample_file_location, exist_ok=True)

    async def parse(self, response):
        """Follow in-site links, parsing anything under /article/."""
        # Skip pages already processed. The original only yielded None here
        # and fell through, so deduplication never took effect and visited
        # URLs were re-crawled.
        if response.url in self.visited:
            return
        self.visited.append(response.url)
        for href in response.css('a::attr(href)').getall():
            if self.start_urls[0] in href or href.startswith('/'):
                if '/article/' in href:
                    yield await self.parse_article(response.urljoin(href))
                else:
                    yield response.follow(href, self.parse)
            else:
                # Use the module logger rather than print().
                logger.debug('Invalid external link: %s', href)

    async def parse_article(self, url):
        """Render *url* with Playwright and parse it into an article dict.

        Returns:
            ``article.to_dict()`` on success, ``None`` when parsing failed
            (previously this raised AttributeError on ``None.to_dict()``).
        """
        article = None
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url)  # , wait_until='networkidle'

                content = await page.content()
                htmlresponse = HtmlResponse(url=page.url, body=content, encoding='utf-8')

                article = CTVNewsParser.parse_article(htmlresponse)

                if self.sample_file_location is not None:
                    # Store screenshot
                    await page.screenshot(
                        path=os.path.join(self.sample_file_location, f'{self.count}-screen.png'),
                        full_page=True)

                    # Store article JSON (with the file handle closed, unlike
                    # the original bare open()), only when parsing succeeded.
                    if article is not None:
                        json_path = os.path.join(self.sample_file_location, f'{self.count}-article.json')
                        with open(json_path, 'w', encoding='utf-8') as file:
                            json.dump(article.to_dict(), file, indent=4)

                    # Store HTML content
                    html_path = os.path.join(self.sample_file_location, f'{self.count}-contenthtml.html')
                    with open(html_path, 'w', encoding='utf-8') as file:
                        file.write(htmlresponse.body.decode('utf-8'))
            finally:
                # The original called browser.close() without awaiting the
                # coroutine, so the browser process was never shut down.
                await browser.close()

        self.count += 1
        return article.to_dict() if article is not None else None
185+
186+
if __name__ == "__main__":

    SAMPLE_FOLDER = 'sample3'

    # Clean up sample folder. Guard with isfile: os.remove raises on a
    # directory entry, which would abort the whole run.
    for sample_path in glob.glob(os.path.join(SAMPLE_FOLDER, '*')):
        if os.path.isfile(sample_path):
            os.remove(sample_path)

    settings = get_project_settings()
    settings['PLAYWRIGHT_BROWSER_TYPE'] = 'chromium'
    settings['PLAYWRIGHT_LAUNCH_OPTIONS'] = {'headless': True}
    settings['ROBOTSTXT_OBEY'] = True

    process = CrawlerProcess(settings=settings)
    # Class attribute: the spider is instantiated by Scrapy, so artifact
    # capture is configured on the class before crawl() starts.
    CTVNewsSpider.sample_file_location = SAMPLE_FOLDER
    process.crawl(CTVNewsSpider)
    process.start()