-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsec_parser.py
More file actions
79 lines (63 loc) · 2.16 KB
/
sec_parser.py
File metadata and controls
79 lines (63 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re

import requests
import pandas as pd  # noqa: F401 -- currently unused; kept for downstream use
from bs4 import BeautifulSoup as bs

# SEC EDGAR company-browse endpoint. The SEC requires a User-Agent that
# identifies the caller (plus contact info); anonymous agents get throttled.
base_url = r"https://www.sec.gov/cgi-bin/browse-edgar"
headers = {
    "User-Agent": "My User Agent 1.1",
    "From": "adgoch11@gmail.com",
}
# Query: the last 10 "10-k" filings for Walmart (ticker WMT), XML output.
param_dict = {
    "action": "getcompany",
    "CIK": "WMT",
    "type": "10-k",
    "owner": "exclude",
    "output": "XML",
    "count": "10",
}
response = requests.get(url=base_url, params=param_dict, headers=headers)
print(response.url)
# Fail fast on HTTP errors instead of silently parsing an error page
# (the original printed "Request Successful" without ever checking status).
response.raise_for_status()
soup = bs(response.content, "lxml")
print("Request Successful")

# Base used to turn the relative hrefs in the results into absolute links.
base_url_sec = r"https://www.sec.gov"

# The filings live in the third <table> of the results page.
# NOTE(review): positional table indexing is fragile -- revisit if the SEC
# changes the page layout.
doc_table = soup("table")[2]
doc_table = bs(str(doc_table), "html.parser")

# Collect every table row that mentions "10-K". Build the HTML with a
# list + join instead of repeated string concatenation (avoids the
# quadratic `s += ...` pattern of the original).
row_fragments = [
    str(item.parent.parent.prettify())
    for item in doc_table.find_all(string=re.compile("10-K"))
]
ten_k_rows = bs("".join(row_fragments), "html.parser")

# Dump the matched rows for debugging; pin the encoding so the output file
# is identical across platforms.
with open("row.txt", "w", encoding="utf-8") as file:
    file.write(str(ten_k_rows.prettify()))

# Build {filing_date -> {"documents": url, "interactive_data": url}}.
file_dict = {
    "file_type": param_dict["type"],
    "ticker": param_dict["CIK"],
    "links": {},
}
for row in ten_k_rows.find_all("tr"):
    # The 4th column holds the filing date, e.g. "2023-03-31" -> "20230331".
    filing_date = str(row.find_all("td")[3].text).strip()
    date = filing_date.replace("-", "")
    file_dict["links"][date] = {}
    filing_doc_href = row.find("a", {"id": "documentsbutton"})
    if filing_doc_href is not None:
        file_dict["links"][date]["documents"] = base_url_sec + filing_doc_href["href"]
    filing_int_href = row.find("a", {"id": "interactiveDataBtn"})
    if filing_int_href is not None:
        file_dict["links"][date]["interactive_data"] = (
            base_url_sec + filing_int_href["href"]
        )
# print(file_dict)

# Fetch the interactive-data page of the most recent filing that has one.
# (The original blindly indexed the first filing and raised KeyError when
# that filing had no interactive-data button -- a case its own parsing loop
# explicitly allows for.)
interactive_url = next(
    (
        links["interactive_data"]
        for links in file_dict["links"].values()
        if "interactive_data" in links
    ),
    None,
)
if interactive_url is None:
    raise RuntimeError("No 10-K filing with an interactive-data link was found.")
ten_k = requests.get(url=interactive_url, headers=headers)
ten_k.raise_for_status()
ten_k_soup = bs(ten_k.content, "lxml")
# The first element whose text mentions "10-K"; its grandparent <a> tag
# carries the href of the actual 10-K document.
ten_k_doc_href = ten_k_soup.find_all(string=re.compile("10-K"))[0].parent.parent["href"]
fin = base_url_sec + ten_k_doc_href
print(fin)