-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsec_parser.py
More file actions
79 lines (63 loc) · 2.16 KB
/
sec_parser.py
File metadata and controls
79 lines (63 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re

import requests
import pandas as pd  # noqa: F401 -- currently unused; kept for downstream use
from bs4 import BeautifulSoup as bs

# SEC EDGAR company-browse endpoint. The SEC requires a User-Agent that
# identifies the caller (plus contact info); anonymous agents get throttled.
base_url = r"https://www.sec.gov/cgi-bin/browse-edgar"
headers = {
    "User-Agent": "My User Agent 1.1",
    "From": "adgoch11@gmail.com",
}
# Query: the last 10 "10-k" filings for Walmart (ticker WMT), XML output.
param_dict = {
    "action": "getcompany",
    "CIK": "WMT",
    "type": "10-k",
    "owner": "exclude",
    "output": "XML",
    "count": "10",
}
response = requests.get(url=base_url, params=param_dict, headers=headers)
print(response.url)
# Fail fast on HTTP errors instead of silently parsing an error page
# (the original printed "Request Successful" without ever checking status).
response.raise_for_status()
soup = bs(response.content, "lxml")
print("Request Successful")

# Base used to turn the relative hrefs in the results into absolute links.
base_url_sec = r"https://www.sec.gov"

# The filings live in the third <table> of the results page.
# NOTE(review): positional table indexing is fragile -- revisit if the SEC
# changes the page layout.
doc_table = soup("table")[2]
doc_table = bs(str(doc_table), "html.parser")

# Collect every table row that mentions "10-K". Build the HTML with a
# list + join instead of repeated string concatenation (avoids the
# quadratic `s += ...` pattern of the original).
row_fragments = [
    str(item.parent.parent.prettify())
    for item in doc_table.find_all(string=re.compile("10-K"))
]
ten_k_rows = bs("".join(row_fragments), "html.parser")

# Dump the matched rows for debugging; pin the encoding so the output file
# is identical across platforms.
with open("row.txt", "w", encoding="utf-8") as file:
    file.write(str(ten_k_rows.prettify()))

# Build {filing_date -> {"documents": url, "interactive_data": url}}.
file_dict = {
    "file_type": param_dict["type"],
    "ticker": param_dict["CIK"],
    "links": {},
}
for row in ten_k_rows.find_all("tr"):
    # The 4th column holds the filing date, e.g. "2023-03-31" -> "20230331".
    filing_date = str(row.find_all("td")[3].text).strip()
    date = filing_date.replace("-", "")
    file_dict["links"][date] = {}
    filing_doc_href = row.find("a", {"id": "documentsbutton"})
    if filing_doc_href is not None:
        file_dict["links"][date]["documents"] = base_url_sec + filing_doc_href["href"]
    filing_int_href = row.find("a", {"id": "interactiveDataBtn"})
    if filing_int_href is not None:
        file_dict["links"][date]["interactive_data"] = (
            base_url_sec + filing_int_href["href"]
        )
# print(file_dict)

# Fetch the interactive-data page of the most recent filing that has one.
# (The original blindly indexed the first filing and raised KeyError when
# that filing had no interactive-data button -- a case its own parsing loop
# explicitly allows for.)
interactive_url = next(
    (
        links["interactive_data"]
        for links in file_dict["links"].values()
        if "interactive_data" in links
    ),
    None,
)
if interactive_url is None:
    raise RuntimeError("No 10-K filing with an interactive-data link was found.")
ten_k = requests.get(url=interactive_url, headers=headers)
ten_k.raise_for_status()
ten_k_soup = bs(ten_k.content, "lxml")
# The first element whose text mentions "10-K"; its grandparent <a> tag
# carries the href of the actual 10-K document.
ten_k_doc_href = ten_k_soup.find_all(string=re.compile("10-K"))[0].parent.parent["href"]
fin = base_url_sec + ten_k_doc_href
print(fin)