-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlink.py
More file actions
23 lines (20 loc) · 857 Bytes
/
link.py
File metadata and controls
23 lines (20 loc) · 857 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
################################################################################
# Finds the links inside the paragraphs of the Wikipedia page HOME_URL and
# writes their text (not their URLs) to link.txt, one per line
################################################################################
from __future__ import print_function
import urllib2
from bs4 import BeautifulSoup
# Website to scrape
HOME_URL = 'http://en.wikipedia.org/wiki/Star'

# Request the page and build the BeautifulSoup object. The HTTP response is
# closed as soon as the document has been parsed, even if parsing raises.
home_req = urllib2.Request(HOME_URL)
home_content = urllib2.urlopen(home_req)
try:
    # NOTE(review): no parser is named here, so BeautifulSoup picks one from
    # whatever is installed -- consider pinning one for reproducible output.
    soup = BeautifulSoup(home_content)
finally:
    home_content.close()

# Write the text of every link found inside a paragraph, one per line.
# The `with` block closes link.txt on every exit path, including errors.
with open('link.txt', 'w') as f:
    for parag in soup.find_all('p'):
        for link in parag.find_all('a'):
            # Non-ASCII characters are silently dropped from the link text.
            link_text = link.get_text().encode('ascii', 'ignore')
            # Keep only texts longer than two characters, and skip reference
            # markers such as [42], which start with '['.
            if len(link_text) > 2 and not link_text.startswith('['):
                print(link_text, file=f)