Skip to content

Commit 21c2b31

Browse files
committed
RC-1286 #close #comment CTVNews scraper working. More changes to accommodate data structures.
1 parent 9399e75 commit 21c2b31

5 files changed

Lines changed: 220 additions & 7 deletions

File tree

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
"maple_proc",
1313
"maple_structures",
1414
"~/rcs-utils",
15+
"./newsscrapy"
1516
],
1617
"python.testing.unittestArgs": [
1718
"-v",

install.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ install_packages(){
2626

2727
pip install scrapy-fake-useragent
2828

29+
pip install scrapy-playwright
30+
31+
playwright install chromium
32+
2933
pip install --upgrade pip setuptools
3034

3135
pip install python-socketio python-socketio[client]

maple_structures/maple_structures/maple.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,10 @@ def about(self, value):
167167
setattr(self, "_about", value)
168168

169169
def to_dict(self):
170-
return self._to_dict_endpoint()
170+
author = self._to_dict_endpoint()
171+
if author['email'] == '':
172+
author.pop('email')
173+
return author
171174

172175
@staticmethod
173176
def from_json(data):
@@ -247,7 +250,7 @@ def to_dict(self):
247250
dict(name="title", type=str, default=""),
248251
dict(name="summary", type=str, default=""),
249252
dict(name="content", type=str, default=""),
250-
dict(name="author", type=list, default={},
253+
dict(name="author", type=list, default=[],
251254
secondary_type=Author, validator=Author.validate),
252255
dict(name="video_url", type=list, default=[],
253256
secondary_type=str, validator=validators.url),
@@ -305,8 +308,9 @@ def add_author(self, author: Author):
305308
)
306309
if getattr(self, '_author', []) is None:
307310
setattr(self, '_author', [])
308-
setattr(self, '_author', getattr(self, '_author', []).append(author))
309-
# self._author.append(author)
311+
authors = getattr(self, '_author', [])
312+
authors.append(author)
313+
setattr(self, '_author', authors)
310314

311315
def to_dict(self, *, suppress_null=True):
312316
'''converts Article to dictionary'''
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
2+
3+
import os
4+
import glob
5+
from typing import Union
6+
import re
7+
import json
8+
import logging
9+
10+
from scrapy.spiders import Spider
11+
from scrapy.http import HtmlResponse
12+
from scrapy.crawler import CrawlerProcess
13+
from scrapy.utils.project import get_project_settings
14+
15+
from playwright.async_api import async_playwright
16+
17+
from maple_structures import Article, Author
18+
19+
logger = logging.getLogger("CTVNewsSpider")
20+
21+
class CTVNewsParser:
22+
@staticmethod
23+
def parse_article(response: HtmlResponse) -> Union[Article, None]:
24+
article = None
25+
try:
26+
article = CTVNewsParser.parse_from_json(response)
27+
except json.JSONDecodeError as e:
28+
logger.warning('Error parsing article from JSON: %s', e)
29+
30+
if article is None:
31+
try:
32+
article = CTVNewsParser.parse_from_html(response)
33+
except NotImplementedError as e:
34+
logger.warning('HTML parsing not implemented: %s', e)
35+
except Exception as e:
36+
logger.warning('Error parsing article from HTML: %s', e)
37+
38+
return article
39+
40+
41+
@staticmethod
42+
def parse_from_json(response: HtmlResponse) -> Union[Article, None]:
43+
js = response.xpath('//script[@id="fusion-metadata"]//text()').extract_first()
44+
45+
if js:
46+
match = re.search(r'Fusion\.globalContent\s*=\s*({.*?});', js, re.DOTALL)
47+
if match:
48+
json_str = match.group(1)
49+
data = json.loads(json_str)
50+
article = Article()
51+
article.url = response.url
52+
53+
# Title
54+
if 'headlines' in data:
55+
if 'basic' in data['headlines']:
56+
article.title = data['headlines']['basic']
57+
# Content
58+
if 'content_elements' in data:
59+
if isinstance(data['content_elements'], list):
60+
content = article.title
61+
for element in data['content_elements']:
62+
if 'type' in element and element['type'] == 'text':
63+
if 'content' in element:
64+
content += '\n' + element['content']
65+
elif 'type' in element and element['type'] == 'header':
66+
if 'content' in element:
67+
content += '\n' + element['content'] + '\n'
68+
else:
69+
if 'content' in element:
70+
content += ' ' + element['content']
71+
article.content = content
72+
# Date Published
73+
if 'first_publish_date' in data:
74+
article.date_published = data['first_publish_date']
75+
elif 'publish_date' in data:
76+
article.date_published = data['publish_date']
77+
if 'last_updated_date' in data:
78+
article.date_modified = data['last_updated_date']
79+
80+
if 'language' in data:
81+
article.language = data['language']
82+
83+
# Authors
84+
if 'credits' in data:
85+
if 'by' in data['credits']:
86+
for author in data['credits']['by']:
87+
new_author = Author()
88+
if 'additional_properties' in author:
89+
if 'original' in author['additional_properties']:
90+
original = author['additional_properties']['original']
91+
name = ''
92+
if 'firstName' in original:
93+
name += original['firstName']
94+
if 'lastName' in original:
95+
name += ' ' + original['lastName']
96+
new_author.name = name
97+
98+
if 'email' in original:
99+
new_author.email = original['email']
100+
else:
101+
new_author.email = None
102+
if 'bio_page' in original:
103+
new_author.url = response.urljoin(original['bio_page'])
104+
105+
# if best format is not found, try to parse from other fields
106+
elif 'type' in author and author['type'] == 'author':
107+
if 'name' in author:
108+
new_author.name = author['name']
109+
if 'url' in author:
110+
new_author.url = response.urljoin(author['url'])
111+
if 'social_links' in author:
112+
if 'url' in author['social_links']:
113+
if 'site' in author['social_links'] and author['social_links']['site'] == 'email':
114+
new_author.email = author['social_links']['url']
115+
if new_author.name != '':
116+
article.add_author(new_author)
117+
return article
118+
raise ValueError('Article not found in JSON')
119+
120+
@staticmethod
121+
def parse_from_html(response: HtmlResponse) -> Union[Article, None]:
122+
raise NotImplementedError()
123+
124+
125+
class CTVNewsSpider(Spider):
126+
name = 'CTVNewsSpider'
127+
sample_file_location = None
128+
129+
def __init__(self, on_article_content: callable = None, **kwargs):
130+
super().__init__(self.name, **kwargs)
131+
self.start_urls = ['https://www.ctvnews.ca']
132+
self.on_article_content = on_article_content
133+
self.visited = []
134+
self.count=0
135+
if self.sample_file_location is not None:
136+
os.makedirs(self.sample_file_location, exist_ok=True)
137+
138+
async def parse(self, response):
139+
if response.url in self.visited:
140+
yield None
141+
self.visited.append(response.url)
142+
for href in response.css('a::attr(href)').getall():
143+
if self.start_urls[0] in href or href.startswith('/'):
144+
if '/article/' in href:
145+
yield await self.parse_article(response.urljoin(href))
146+
else:
147+
yield response.follow(href, self.parse)
148+
else:
149+
print(f'Invalid external link: {href}')
150+
151+
async def parse_article(self, url):
152+
article = None
153+
async with async_playwright() as p:
154+
browser = await p.chromium.launch(headless=True)
155+
page = await browser.new_page()
156+
157+
await page.goto(url) # , wait_until='networkidle'
158+
159+
content = await page.content()
160+
htmlresponse = HtmlResponse(url=page.url, body=content, encoding='utf-8')
161+
162+
article = CTVNewsParser.parse_article(htmlresponse)
163+
164+
if self.sample_file_location is not None:
165+
# Store screenshot
166+
await page.screenshot(
167+
path=os.path.join(self.sample_file_location, f'{self.count}-screen.png'),
168+
full_page=True)
169+
170+
# Store article JSON
171+
json.dump(article.to_dict(), open(os.path.join(self.sample_file_location, f'{self.count}-article.json'), 'w', encoding='utf-8'), indent=4)
172+
173+
# Store HTML content
174+
html = htmlresponse.body.decode('utf-8')
175+
with open(
176+
os.path.join(self.sample_file_location,f'{self.count}-contenthtml.html'),
177+
'w',
178+
encoding='utf-8') as file:
179+
file.write(html)
180+
181+
browser.close()
182+
self.count += 1
183+
184+
return article.to_dict()
185+
186+
187+
if __name__ == "__main__":
188+
189+
SAMPLE_FOLDER = 'sample3'
190+
191+
# cleanup sample folder.
192+
files = glob.glob(os.path.join(SAMPLE_FOLDER, '*'))
193+
for f in files:
194+
os.remove(f)
195+
196+
settings = get_project_settings()
197+
settings['PLAYWRIGHT_BROWSER_TYPE'] = 'chromium'
198+
settings['PLAYWRIGHT_LAUNCH_OPTIONS'] = {'headless': True}
199+
settings['ROBOTSTXT_OBEY'] = True
200+
201+
process = CrawlerProcess(settings=settings)
202+
CTVNewsSpider.sample_file_location = SAMPLE_FOLDER
203+
process.crawl(CTVNewsSpider)
204+
process.start()

runtime_scripts/data_fetcher.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,13 @@
1616

1717
sys.path.append(os.path.join(os.path.abspath(""), "newsscrapy"))
1818
sys.path.append(os.path.join(os.path.abspath(""), "../newsscrapy"))
19-
from newsscrapy.spiders import scrapyCBC, scrapyCTVNews
19+
from newsscrapy.spiders import scrapyCBC, scrapyCTVNews, CTVNewsV2
2020

2121
print(sys.path)
2222

2323

2424

2525

26-
2726
scrapy.utils.reactor.install_reactor(
2827
"twisted.internet.asyncioreactor.AsyncioSelectorReactor"
2928
)
@@ -64,7 +63,8 @@
6463

6564
spiders_ = [
6665
scrapyCBC.CBCHeadlinesSpider,
67-
scrapyCTVNews.CTVNewsSpider,
66+
# scrapyCTVNews.CTVNewsSpider,
67+
CTVNewsV2.CTVNewsSpider,
6868
]
6969

7070

0 commit comments

Comments (0)