-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
138 lines (111 loc) · 4.37 KB
/
Copy pathmain.py
File metadata and controls
138 lines (111 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from bs4 import BeautifulSoup
import csv
import requests
from urllib.parse import urljoin
import time
def search_job(keyword,writer):
    """Scrape python.org search results for *keyword* and record the jobs found.

    Fetches the search result pages one at a time, hands each page's result
    items to ``page_scraper`` and follows the "Next" pagination link until no
    further page is available, a request fails, or the site reports that
    there are no results.

    Args:
        keyword: Free-text search term entered by the user.
        writer: Object exposing ``writerow`` (e.g. a ``csv.writer``); it is
            passed through to ``page_scraper``, which writes one row per job.

    Returns:
        None. Progress messages and the final total are printed.
    """
    # Imported locally so this function does not depend on a file-level import.
    from urllib.parse import quote_plus

    request_timeout = 10  # seconds; keeps a stalled connection from hanging the run
    total = 0
    reported_no_results = False
    # quote_plus turns spaces into "+" and also escapes characters such as
    # "&" or "#" that would otherwise corrupt the query string.
    url = f"https://www.python.org/search/?q={quote_plus(keyword)}&submit="
    while True:
        time.sleep(1)  # pause between page requests
        try:
            req = requests.get(url, timeout=request_timeout)
        except requests.RequestException:
            print("Request failed")
            break
        if req.status_code != 200:
            print("Request failed")
            break
        soup = BeautifulSoup(req.text, "lxml")
        jobs = soup.select(".list-recent-events li")
        # An empty selection is treated like the site's explicit
        # "No results found." entry instead of indexing into an empty list.
        if not jobs or "No results found." in jobs[-1].text:
            print("No results found.")
            reported_no_results = True
            break
        total += page_scraper(jobs, url, False, writer)
        results_list = soup.select_one(".list-recent-events")
        next_div = results_list.find_next_sibling("div") if results_list else None
        if not next_div:
            print("Reached last page")
            break
        next_url = None
        for link in next_div.select("a"):
            if "Next" in link.text:
                next_url = link.get("href")
                break
        if not next_url:
            print("Reached last page")
            break
        url = urljoin(url, next_url)
    # Always print the total; add the "no results" note only when nothing was
    # scraped and that message has not already been shown above.
    summary = f"\nTotal jobs scraped: {total}"
    if total == 0 and not reported_no_results:
        summary += ",\nNo results found."
    print(summary)
def homepage(writer):
    """Scrape the job listings shown on the python.org jobs landing page.

    Fetches ``https://www.python.org/jobs/``, passes the listing items to
    ``page_scraper`` in homepage mode and prints how many jobs were written.

    Args:
        writer: Object exposing ``writerow`` (e.g. a ``csv.writer``); it is
            passed through to ``page_scraper``, which writes one row per job.

    Returns:
        None. Progress messages and the final total are printed.
    """
    request_timeout = 10  # seconds; keeps a stalled connection from hanging the run
    total = 0
    url = "https://www.python.org/jobs/"
    try:
        req = requests.get(url, timeout=request_timeout)
    except requests.RequestException:
        req = None
    if req is None or req.status_code != 200:
        print("Request failed")
    else:
        soup = BeautifulSoup(req.text, "lxml")
        jobs = soup.select(".list-recent-jobs li")
        if jobs:
            total += page_scraper(jobs, url, True, writer)
        else:
            print("No results found.")
    # Parenthesised so the conditional only controls the suffix; without the
    # parentheses the whole message collapses to "" whenever total > 0.
    print(f"\nTotal jobs scraped: {total}" + (",no results found" if total == 0 else ""))
def page_scraper(jobs,url,homepage,writer):
    """Visit each job linked from a listing page and write one CSV row per job.

    For every item in *jobs* the detail link is resolved against *url*; items
    without a link, or whose link does not contain ``/jobs/``, are skipped.
    Each detail page is fetched and, if it has a ``.job-description`` element,
    a row ``[title, company, location, brief, posted date, job URL]`` is
    written.

    Args:
        jobs: Iterable of parsed listing items (objects supporting
            ``select_one``).
        url: URL of the listing page, used as the base for relative links.
        homepage: True when the items come from the jobs landing page (link
            under ``.listing-company a``), False for search results (link
            under ``h3 a``).
        writer: Object exposing ``writerow``.

    Returns:
        Number of rows written.
    """
    request_timeout = 10  # seconds; keeps a stalled connection from hanging the run
    # Section headings removed from the description before it is shortened.
    bad_words = ("Key information", "Position Details", "Responsibilities",
                 "About the role", "About the company")
    link_selector = ".listing-company a" if homepage else "h3 a"
    count = 0
    for job in jobs:
        link = job.select_one(link_selector)
        href = link.get("href") if link else None
        # Skip items with no anchor or no href instead of raising, and skip
        # links that do not point at a job page.
        if not href or "/jobs/" not in href:
            continue
        job_url = urljoin(url, href)
        try:
            sub_req = requests.get(job_url, timeout=request_timeout)
        except requests.RequestException:
            print("Request failed")
            continue
        if sub_req.status_code != 200:
            print("Request failed")
            continue
        sub_soup = BeautifulSoup(sub_req.text, "lxml")
        desc = sub_soup.select_one(".job-description")
        if not desc:
            continue
        text = desc.get_text(separator=" ").strip()
        if "Job Description" in text:
            # Keep only what follows the first "Job Description" heading.
            text = text.split("Job Description", 1)[1].strip()
        text = " ".join(text.split())  # collapse runs of whitespace
        for word in bad_words:
            text = text.replace(word, "")
        text = text.lstrip(":, ")
        brief = text[:150] + "....." if len(text) > 150 else text
        company_tag = sub_soup.select_one(".company-name")
        data = list(company_tag.stripped_strings) if company_tag else []
        if data:
            # First text fragment is used as the title, last as the company.
            job_title = data[0]
            company_name = data[-1]
        else:
            job_title = "Not available"
            company_name = "Not available"
        location_tag = sub_soup.select_one(".listing-location")
        location = location_tag.text.strip() if location_tag else "Not specified"
        date_tag = sub_soup.select_one(".listing-posted time")
        date = date_tag.text.strip() if date_tag else "Not available"
        writer.writerow([job_title, company_name, location, brief, date, job_url])
        count += 1
    return count
def jobs_scraper():
    """Run the interactive scraper and save the results to ``Jobs.csv``.

    Creates (or overwrites) ``Jobs.csv`` in the current directory, writes the
    header row, then prompts for a search keyword. A non-empty keyword runs
    ``search_job``; an empty answer runs ``homepage`` to scrape the latest
    jobs.

    Returns:
        None.
    """
    # newline="" lets the csv module control line endings itself; without it
    # every row is followed by a blank line on platforms that use \r\n.
    with open("Jobs.csv", "w", encoding="utf-8-sig", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Job title","Company","Location","Description","Posted on","Job link"])
        print(f'{"="*40} \n \t PYTHON JOB SCRAPER \n{"="*40}\n ' )
        print("Enter keywords to search jobs\n"
              "Or press Enter to scrape the latest jobs")
        keyword = input("Search: ")
        if keyword:
            print("Searching......🔍")
            search_job(keyword, writer)
        else:
            print("Scraping latest jobs......")
            homepage(writer)
# Start the interactive scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__=="__main__":
    jobs_scraper()