CodeAlpha_Web_Scraping/scraper1.py at main · Waheed-6907/CodeAlpha_Web_Scraping · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#1. Accessing URL
import requests
from bs4 import BeautifulSoup

url="https://realpython.github.io/fake-jobs/"
response=requests.get(url)
print(response.status_code)
#2. Web Scraping
soup=BeautifulSoup(response.text,"html.parser")
jobs=soup.find_all("div",class_="card-content")
#3. Printing all jobs
for job in jobs:
    title=job.find("h2",class_="title").text.strip()
    company=job.find("h3",class_="company").text.strip()
    location=job.find("p",class_="location").text.strip()

    print(title,"-",company,"-",location)

#4. Converting into table
import pandas as pd

titles_list=[]
company_list=[]
location_list=[]

for job in jobs:
    titles_list.append(job.find("h2",class_="title").text.strip())
    company_list.append(job.find("h3",class_="company").text.strip())
    location_list.append(job.find("p",class_="location").text.strip())

print(f"Total jobs scraped: {len(jobs)}")

df=pd.DataFrame({
    "JOB TITLE":titles_list,
    "COMPANY":company_list,
    "LOCATION":location_list
})

df
#5. Save to Excel
df.to_excel("fake_jobs.xlsx", index=False)

#6. Filtering Data
#Location wise
print("JOBS IN LOCATION 'AA':")
aa_jobs=df[df["LOCATION"].str.contains("AA")]
print(aa_jobs)

print("JOBS IN LOCATION 'AE':")
ae_jobs=df[df["LOCATION"].str.contains("AE")]
print(ae_jobs)

#Domain wise
print("PYTHON JOBS:")
python_jobs=df[df["JOB TITLE"].str.contains("Python")]
print(python_jobs)

print("TEACHING JOBS:")
teaching_jobs=df[df["JOB TITLE"].str.contains("Teacher")]
print(teaching_jobs)
#Statistics
print(df["LOCATION"].value_counts())

top_location = df["LOCATION"].value_counts().idxmax()
print("Most jobs in:", top_location)