versionAlpha/webscraper.py at master · RRadiance/versionAlpha · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
from bs4 import BeautifulSoup
from requests import get
import re
import time


class Webscraper:
    """Scrape summary and income-statement figures from nasdaq.com symbol pages.

    Both public methods perform one HTTP GET per call, print the elapsed
    wall-clock time, and return the scraped values as strings with thousands
    separators removed.
    """

    # Row labels noted by the original author for the income statement table.
    # NOTE(review): nothing in this class reads this list, and its order has
    # not been verified against the live page -- confirm before relying on it
    # to label the per-year lists.
    _INCOME_STATEMENT_LEGEND = (
        'Period Ending', 'Total Revenue', 'Gross Profit',
        'Research and Development', 'Sales, General, and Admin,',
        'Non-Recurring Items', 'Other operating items',
        'Operating income', 'Addtnl income/expense items',
        'Earnings before interest and tax', 'Interest Expense',
        'Earnings Before Tax', 'Income Tax', 'Minority Interest',
        'Equity Earnings/Loss Unconsolidated Subsidiary',
        'Net income-cont. operations', 'Net income',
        'Net income appplicable to common shareholders',
    )

    # Dates such as 9/30/2018 or 12/31/2018 (year must start with '2').
    _PERIOD_PATTERN = re.compile(r'(\d*\d/\d\d/[2]\d\d\d)')
    # The run of digits that follows a '$' sign.
    _DOLLAR_PATTERN = re.compile(r'\$(\d*)')

    def __init__(self):
        """Initialize the (currently unused) scratch attributes."""
        self.text = ''
        self.lst = []

    def get_data_nasdaq_summary(self, ticker: str) -> dict:
        """Retrieve the key/value table from the NASDAQ summary page.

        Args:
            ticker: Symbol appended to ``https://www.nasdaq.com/symbol/``.

        Returns:
            A dict mapping each label cell's text to the text of the cell that
            follows it, with commas removed from the value.
        """
        start_time = time.time()
        url = 'https://www.nasdaq.com/symbol/' + ticker
        response = get(url)
        # Create parse tree (BeautifulSoup object)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every table cell from each of the two-column containers.
        cells = []
        for column in soup.find_all(class_='column span-1-of-2'):
            cells.extend(column.find_all(class_='table-cell'))

        # get_text strips the HTML tags; the helper drops non-ASCII characters.
        texts = [self._to_ascii(cell.get_text(strip=True)) for cell in cells]
        summary = self._pair_cells(texts)

        print('Elapsed time: ' + str(time.time() - start_time))
        # The original returned an undefined name here (NameError on every
        # call); the dictionary is what the signature promises.
        return summary

    def get_data_nasdaq_income_statement(self, ticker: str) -> tuple:
        """Retrieve per-year income statement values from the NASDAQ page.

        Args:
            ticker: Symbol inserted into the financials URL.

        Returns:
            A tuple ``(year1, year2, year3, year4)`` of lists of strings, most
            recent year first as the page lists them. When parsing succeeds
            each list holds the period-ending date followed by that year's
            dollar amounts in page order. If the page has no ``genTable``
            element, or the number of dates or amounts found is zero or not a
            multiple of four, the affected part is left out and a diagnostic
            is printed instead of raising.
        """
        start_time = time.time()
        url = 'https://www.nasdaq.com/symbol/' + ticker + \
            '/financials?query=income-statement'
        response = get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        tables = soup.find_all(class_='genTable')

        # strip=True removes the surrounding whitespace/newlines of each text
        # node. A page without the table is treated as empty text so that it
        # follows the same "incomplete" path as other unparsable pages.
        text = tables[0].get_text(strip=True) if tables else ''
        years = self._parse_income_statement_text(text)

        print('Elapsed time: ' + str(time.time() - start_time))
        return years

    @staticmethod
    def _to_ascii(raw: str) -> str:
        """Return *raw* with every non-ASCII character dropped."""
        return raw.encode('ASCII', 'ignore').decode('utf-8')

    @staticmethod
    def _pair_cells(texts: list) -> dict:
        """Pair alternating label/value strings into a dict.

        Commas are removed from the values. A trailing label without a value
        is ignored rather than raising IndexError.
        """
        return {
            label: value.replace(',', '')
            for label, value in zip(texts[::2], texts[1::2])
        }

    @classmethod
    def _parse_income_statement_text(cls, text: str) -> tuple:
        """Split the flattened table text into four per-year lists.

        Args:
            text: Text of the income statement table with tags removed.

        Returns:
            ``(year1, year2, year3, year4)`` as described on
            ``get_data_nasdaq_income_statement``.
        """
        # Thousands separators would otherwise cut the dollar regex short.
        text = text.replace(',', '')
        years = ([], [], [], [])
        complete = True

        periods = cls._PERIOD_PATTERN.findall(text)
        print('There is income statement data for ' + str(len(periods)) +
              ' years')
        if not periods or len(periods) % 4 != 0:
            print('Incomplete income statement info. View webscraper.py (years regex)')
            complete = False
        else:
            # One period-ending date per year: the first four dates go to
            # year1..year4 respectively.
            for year, period in zip(years, periods):
                year.append(period)

        # WARNING: this assumes the company reports all four years, i.e. the
        # dollar amounts come in consecutive groups of four (one per year).
        amounts = cls._DOLLAR_PATTERN.findall(text)
        if not amounts or len(amounts) % 4 != 0:
            print('Incomplete income statement info. View webscraper.py '
                  '(values regex)')
            complete = False
        else:
            # Every fourth amount, starting at the year's offset, belongs to
            # the same year.
            for offset, year in enumerate(years):
                year.extend(amounts[offset::4])

        if complete:
            print('Successfully stored income statement information')
        return years