diff --git a/src/concall_tools.py b/src/concall_tools.py index e347815..5f8736f 100644 --- a/src/concall_tools.py +++ b/src/concall_tools.py @@ -1,10 +1,19 @@ import fitz +import enchant -from speakers.extraction import Speaker + +from speakers.extraction import ( + Speaker, + Speaker_with_is_management +) from speakers.extraction import ( get_speakers_from_text as _get_speakers_from_text, + get_speakers_in_bold as _get_speakers_in_bold, + get_speakers_capitals as _get_speakers_capitals, + get_lines as _get_lines, + remove_last_char_if_colon as _remove_last_char_if_colon ) -from speakers.extraction import get_speakers_in_bold as _get_speakers_in_bold + def get_speakers(pdf_name, algorithm="auto"): @@ -20,13 +29,58 @@ def get_speakers(pdf_name, algorithm="auto"): - plain - auto """ - if algorithm not in ["bold", "plain", "auto"]: - raise ValueError("algorithm must be one of 'bold', 'plain', 'auto'") - + if algorithm not in ["bold", "plain", "auto", "Capitals"]: + raise ValueError("algorithm must be one of 'bold', 'plain', 'auto' ") doc = fitz.open(pdf_name) speakers = [] + speakers_common = [] + names_from_plain_bold = [] if algorithm == "auto" or algorithm == "bold": speakers = _get_speakers_in_bold(doc) if not speakers and (algorithm == "auto" or algorithm == "plain"): speakers = _get_speakers_from_text(doc) - return speakers + if algorithm in ["bold",'plain']: + return speakers + else: + speakers_in_capital = _get_speakers_capitals(pdf_name) + for name,firm in speakers: + names_from_plain_bold.append(name) + for speaker in speakers_in_capital: + if speaker.name in names_from_plain_bold: + speakers_common.append(speaker) + else: + if speaker.firm!=None: + speakers_common.append(speaker) + return speakers_common + +def get_conversations(pdf_name): + speakers_common=get_speakers(pdf_name) + names=[] + conversation=[] + conversation_with_speaker=[] + order_of_speakers=[] + lines=[] + s='' + lines=_get_lines(pdf_name) + for speaker in speakers_common: + 
names.append(speaker[0]) + flag=0 + for line in lines: + line=_remove_last_char_if_colon(line) + if line in names: + order_of_speakers.append(line) + if line not in names: + s = s + line + ' ' + else: + if flag == 1: + conversation.append(s) + flag=1 + s='' + if s!='': + conversation.append(s) + for i in range(len(order_of_speakers)): + for speaker in speakers_common: + if speaker[0]==order_of_speakers[i]: + conversation_with_speaker.append((speaker,conversation[i])) + return conversation_with_speaker + \ No newline at end of file diff --git a/src/speakers/extraction.py b/src/speakers/extraction.py index 747d077..f9bbb77 100644 --- a/src/speakers/extraction.py +++ b/src/speakers/extraction.py @@ -3,8 +3,12 @@ import lxml.html import nltk +import fitz +import enchant +english_dictionary = enchant.Dict("en_US") Speaker = namedtuple("Speaker", ["name", "firm"]) +Speaker_with_is_management = namedtuple("Speaker", ["name", "firm", "is_management"]) def _get_pages(doc): @@ -29,7 +33,7 @@ def _get_main_pages(doc): word_counts = [page.word_count for page in pages] median = sorted(word_counts)[len(word_counts) // 2] # main pages are pages with at-least half as much text - main_pages = [page for page in pages if page.word_count > (median / 3)] + main_pages = [page for page in pages if page.word_count > (median / 4) and page.number!=0] return main_pages @@ -329,3 +333,379 @@ def get_speakers_in_bold(doc): if speaker not in speakers: speakers.append(speaker) return speakers + + +def get_lines(doc): + doc = fitz.open(doc) + pages = _get_main_pages(doc) + text = "\n".join([page.text for page in pages]) + text = text.replace("Pvt.", "Pvt") + text = text.replace(":",":\n") + lines = [l.strip() for l in text.splitlines() if l.strip()] + return lines + +# If most of the lines containing names end with ':', This fucntion uses that condition to filter out lines that do not end with ':' +def check_if_last_char_colon(names): + count = 0 + names_copy = [] + for name in names: + if 
name[-1]==':': + count = count + 1 + names_copy = names + names = [] + for name in names_copy: + # If more than 40% of lines that could be names end with ':', filter lines that do not end with ':' + if count/len(names_copy)>0.40: + if name[-1]==':': + names.append(name) + # Else filter lines that do end with ':', also filtering lines ending with '.' as they don't occur after a name + elif name[-1]!=':' and name[-1]!='.': + names.append(name) + return names + + +def remove_last_char_if_colon(word): + if word[-1]==':' and len(word)!=0: + word = word[0:len(word)-1] + return word + +# Below fucntion is to check how many words in the names to be filtered belong to a english dictionary and eliminate accordingly. +def check_english_dict(names): + names_copy = names + names = [] + possible_names_in_english_dictionary = ['Shah','Moderator','moderator','Raj','Participant'] + names_not_possible=['Kfin'] + for name in names_copy: + count = 0 + for word in name.split(): + word = remove_last_char_if_colon(word) + if english_dictionary.check(word) and word not in possible_names_in_english_dictionary: + count = count+1 + # Condition added to deal with false positives + if name in names_not_possible: + count = count+1 + # Below condtion only accepts names that have less than 50% of the words belonging to a dictionary + if count/len(name.split())<=0.50: + names.append(name) + return names + + +def check_repetations(names): + names_copy = names + for name_outer in names: + for name_inner in names: + if name_outer.find(name_inner)!=-1 and len(name_outer)>len(name_inner): + names_copy.remove(name_inner) + names = names_copy + return(names) + + +def get_speaker_names(doc): + lines = get_lines(doc) + char_not_in_names = [',','-','&','?'] + names = [] + order_of_speakers = [] + for line in lines: + is_name = 0 + words = line.split() + if len(words)<=5: + # Below condition eliminates lines that have only 1 word and contain less than four letters + if len(words)==1 and len(words[0])<4: + 
is_name = 1 + for word in words: + if not word[0].isupper(): + is_name = 1 + for letter in word: + if letter.isnumeric() or letter in char_not_in_names: + is_name = 1 + if is_name==0: + if line not in names: + names.append(line) + names = check_english_dict(names) + names = check_if_last_char_colon(names) + names = check_repetations(names) + for line in lines: + if line in names: + order_of_speakers.append(line) + # Order_of_speakers here has the repeated names of people in the order they speak. + return [names,lines,order_of_speakers] + + +def get_conversation(doc): + names,lines,_ = get_speaker_names(doc) + s = '' + conversation = [] + for line in lines: + if line not in names: + s = s + line + ' ' + else: + conversation.append(s) + s = '' + if s != '': + conversation.append(s) + return (conversation) + +# Modify str takes the conversation as variable l, makes some modifications to extract names and speakers and returns the words +def modify_str(l): + str = l.replace('.','. ') + l = l.replace(',',' , ') + l = l.replace('from','from ') + l = l.replace('(','') + l = l.replace(')','.') + w = l.split() + return w + +# Function tries to find that paragraph where the full name occurs +def find_count(name,w,count_max): + flag = 0 + position = 0 + count = 0 + name_index = 0 + for word in name.split(): + word = remove_last_char_if_colon(word) + if word in w: + if flag==0: + name_index = w.index(word) + flag = 1 + count = count+1 + elif flag==0: + position = position+1 + if count>count_max: + count_max = count + return (count,count_max,position,name_index) + + +def check_if_firm(firm): + count = 0 + # If we Identify a firm that has more than 4 words and all of the words belong to a English Dictionary, + # We discard it from being a firm + if len(firm.split())>4: + for word in firm.split(): + if english_dictionary.check(word): + count = count+1 + if count/len(firm.split())>=0.88: + firm = '' + if firm=='Thank you' or firm=='Sir': + firm = '' + if firm != '': + if 
firm[len(firm)-1]==' ': + firm=firm[0:len(firm)-1] + return firm + +def pass_1(names,conversation,order_of_speakers): + final = {} + # First value in conversation contains lines before any speaker starts, hence we filter it out + conversation = conversation[1:len(conversation)] + words_not_found_as_first_word_of_firm = ['Sir,','I','Mr.','Ms.','Sir'] + for name in names: + name = remove_last_char_if_colon(name) + # Count_max stores the maximum number of words from a name that are Identified in any paragraph + count_max = 0 + for i in range(len(conversation)): + if order_of_speakers[i]=='Moderator:' or order_of_speakers[i]=='Moderator' or order_of_speakers[i]=='Operator' or order_of_speakers[i]=='Operator:': + count = 0 + l = conversation[i] + # Modify_str takes the conversation, makes some changes and returns the words to w + w = modify_str(l) + # Postion is the postion of the word identified in name + # Name_index is the postion of the word identified in w + count,count_max,position,name_index = find_count(name,w,count_max) + # If more than 50% of words in a name are Identified, + # And words Identified are less than max Identified till now, we search for the firm + if count/len(name.split())>=0.5 and count>=count_max: + #flag_2 is used to check if We've found any work Starting with a Capital + #flag_3 is used to check if We've reached end of Conversation while adding names to the firm + flag_2 = 0 + flag_3 = 0 + firm = '' + # C points to the index of the word after the name, + # It points to name_index(position of word in w)+lenght of name(in term of words)-the postion + # Of the word identified in the name + for c in range(name_index+len(name.split())-position,len(w)): + if ((w[c][0].isupper() or w[c]=='individual') and flag_2==0 and w[c] not in words_not_found_as_first_word_of_firm): + flag_2 = 1 + b=((w[c][-1]=='.' 
or w[c][-1]==',')) + while not b: + firm = firm+w[c]+' ' + c = c+1 + if c==len(w): + for letter in firm: + if letter.isnumeric(): + firm = '' + flag_3 = 1 + break + b = ((w[c][-1]=='.' or w[c][-1]==',')) + if flag_3==0: + firm = firm+w[c] + firm = firm[0:len(firm)-1] + firm = check_if_firm(firm) + if name[-1]==':': + name = name[0:len(name)-1] + final[name] = firm + return final + + +def pass_2(names,conversation,order_of_speakers,final): + # First value in conversation contains lines before any speaker starts, hence we filter it out + conversation = conversation[1:len(conversation)] + words_not_found_as_first_word_of_firm = ['Sir,','I','Mr.','Ms.','Sir'] + for name in names: + # Count_max stores the maximum number of words from a name that are Identified in any paragraph + count_max = 0 + name = remove_last_char_if_colon(name) + if name not in final.keys() or final[name]=='': + for i in range(len(conversation)): + order_of_speakers[i] = remove_last_char_if_colon(order_of_speakers[i]) + if order_of_speakers[i]==name: + count = 0 + l = conversation[i] + l = conversation[i] + # Modify_str takes the conversation, makes some changes and returns the words to w + w = modify_str(l) + # Postion is the postion of the word identified in name + # Name_index is the postion of the word identified in w + count,count_max,position,name_index = find_count(name,w,count_max) + # If more than 33%% of words in a name are Identified, + # And words Identified are less than max Identified till now, we search for the firm + if count/len(name.split())>=0.33 and count>=count_max: + # Flag_2 is used to check if We've found any work Starting with a Capital + # Flag_3 is used to check if We've reached end of Conversation while adding names to the firm + flag_2 = 0 + flag_3 = 0 + firm = '' + # C points to the index of the word after the name, + # It points to name_index(position of word in w)+lenght of name(in term of words)-the postion + # Of the word identified in the name + for c in 
range(name_index+len(name.split())-position,len(w)): + if ((w[c][0].isupper() or w[c]=='individual') and flag_2==0 and w[c] not in words_not_found_as_first_word_of_firm): + flag_2 = 1 + b=((w[c][-1]=='.' or w[c][-1]==',')) + while not b: + firm = firm+w[c]+' ' + c = c+1 + if c==len(w): + for letter in firm: + if letter.isnumeric(): + firm = '' + flag_3 = 1 + break + b = ((w[c][-1]=='.' or w[c][-1]==',')) + if flag_3==0: + firm = firm+w[c] + firm = firm[0:len(firm)-1] + firm = check_if_firm(firm) + if name[-1]==':': + name = name[0:len(name)-1] + final[name] = firm + return final + + +def pass_3(names,conversation,order_of_speakers,final): + words_not_found_as_first_word_of_firm = ['Sir,','I','Mr.','Ms.','Sir'] + for name in names: + # Count_max stores the maximum number of words from a name that are Identified in any paragraph + count_max = 0 + name = remove_last_char_if_colon(name) + if name not in final.keys() or final[name]=='': + for i in range(3): + if name not in final.keys() or final[name]=='': + count = 0 + l = conversation[i] + # Modify_str takes the conversation, makes some changes and returns the words to w + w = modify_str(l) + # Postion is the postion of the word identified in name + # Name_index is the postion of the word identified in w + count,count_max,position,name_index = find_count(name,w,count_max) + # If more than 50% of words in a name are Identified, + # And words Identified are less than max Identified till now, we search for the firm + #print(name,count,count_max) + if count/len(name.split())>=0.60 and count>=count_max: + #flag_2 is used to check if We've found any work Starting with a Capital + #flag_3 is used to check if We've reached end of Conversation while adding names to the firm + flag_2 = 0 + flag_3 = 0 + firm = '' + # C points to the index of the word after the name, + # it points to name_index(position of word in w)+lenght of name(in term of words)-the postion + # Of the word identified in the name + for c in 
range(name_index+len(name.split())-position,len(w)): + if ((w[c][0].isupper() or w[c]=='individual') and flag_2==0 and w[c]not in words_not_found_as_first_word_of_firm): + flag_2 = 1 + b = ((w[c][-1]=='.' or w[c][-1]==',')) + while not b: + firm = firm+w[c]+' ' + c = c+1 + if c==len(w): + for letter in firm: + if letter.isnumeric(): + firm = '' + flag_3 = 1 + break + b = ((w[c][-1]=='.' or w[c][-1]==',' or w[c][-1]==';' or w[c]=='and' or w[c]=='May' or w[c]=='Mr.')) + if flag_3==0 and w[c]!='and' and w[c]!='May' and w[c]!='Mr.': + firm = firm+w[c] + firm = firm[0:len(firm)-1] + firm = check_if_firm(firm) + for letter in firm: + if letter.isnumeric(): + firm = '' + if name[-1]==':': + name = name[0:len(name)-1] + final[name] = firm + return final + +def check_if_management(final,is_management,order_of_speakers): + keywords_in_management_designations=['Chief','Officer','Manager','Executive','Director','Chairman','Group','Head','CEO','CFO','COO','Secretary','GM','MD','Corporate','Communications','Relation'] + name_count={} + name_order=[] + for name in final.keys(): + name_count[name]=0 + word_count = 0 + is_management[name]='No' + firm=final[name] + if firm!=None: + firm=firm.replace('-',' - ') + for word in firm.split(): + if word in keywords_in_management_designations: + word_count = word_count + 1 + if word_count/len(final[name].split())>=0.25: + is_management[name]='Yes' + #print(name,word_count,is_management[name]) + for name in order_of_speakers: + name=remove_last_char_if_colon(name) + name_count[name] = name_count[name] + 1 + name_count = dict(sorted(name_count.items(), key=lambda item: item[1], reverse=True)) + for name in name_count.keys(): + name_order.append(name) + for i in range(5): + name=name_order[i] + if final[name] == None and name != 'Moderator': + is_management[name]='Yes' + return is_management + +def get_speakers_capitals(doc): + names,lines,order_of_speakers = get_speaker_names(doc) + conversation = get_conversation(doc) + final = {} + 
is_management = {} + # Pass 1 checks if the moderator announces any names + final = pass_1(names,conversation,order_of_speakers) + # Pass 2 checks if incase moderator has not introduced, if the speaker introduces himself + final = pass_2(names,conversation,order_of_speakers,final) + # For speakers whose firm is still unknown, we check the first 3 conversations to check if they are introduced as management + final = pass_3(names,conversation,order_of_speakers,final) + for name in names: + name = remove_last_char_if_colon(name) + if name not in final.keys() or final[name]=='': + final[name] = None + is_management = check_if_management(final,is_management,order_of_speakers) + speakers = [] + names_copy = [] + # Names_copy is created so that the order in which speakers speak is mantained in the final tuple + for name in names: + name = remove_last_char_if_colon(name) + names_copy.append(name) + for name in names_copy: + speaker = Speaker_with_is_management(name = name, firm = final[name], is_management = is_management[name]) + speakers.append(speaker) + return speakers \ No newline at end of file diff --git a/test_files/eng-india.pdf b/test_files/eng-india.pdf new file mode 100644 index 0000000..edc9013 Binary files /dev/null and b/test_files/eng-india.pdf differ diff --git a/test_files/gsfc.pdf b/test_files/gsfc.pdf new file mode 100644 index 0000000..001dca6 Binary files /dev/null and b/test_files/gsfc.pdf differ diff --git a/test_files/hind-zinc.pdf b/test_files/hind-zinc.pdf new file mode 100644 index 0000000..079968a Binary files /dev/null and b/test_files/hind-zinc.pdf differ diff --git a/test_files/tcs.pdf b/test_files/tcs.pdf new file mode 100644 index 0000000..16b0f47 Binary files /dev/null and b/test_files/tcs.pdf differ diff --git a/tests/test_extract_speakers.py b/tests/test_extract_speakers.py index 38c4e0f..4dcdc22 100644 --- a/tests/test_extract_speakers.py +++ b/tests/test_extract_speakers.py @@ -1,23 +1,24 @@ import unittest from unittest import 
TestCase -from src.concall_tools import Speaker, get_speakers - +from src.concall_tools import Speaker_with_is_management, get_speakers +Speaker=Speaker_with_is_management class ExtractionTestCases(TestCase): def test_extract_speakers_aimco(self): pdf = "test_files/aimco-concall.pdf" speakers = get_speakers(pdf) expected = [ - Speaker(name="Moderator", firm=None), - Speaker(name="Sayam Pokharna", firm="Investment Lab"), - Speaker(name="Ashit Dave", firm=None), - Speaker(name="Navid Virani", firm="Bastion Capital"), - Speaker(name="Ravi Sundaram", firm="Sundaram Family"), - Speaker(name="Parth Agarwal", firm=None), - Speaker(name="Vidya Verma", firm=None), - Speaker(name="Nitin Gandhi", firm="KIFS Trade Capital"), - Speaker(name="Milan Shah", firm="Urmil Research Consultancy"), + Speaker(name="Moderator", firm=None, is_management="No"), + Speaker(name="Sayam Pokharna", firm="The Investment Lab",is_management="No"), + # Ashit Dave is Executive Director and CFO, Missing CFO + Speaker(name="Ashit Dave", firm="Executive Director", is_management="Yes"), + Speaker(name="Navid Virani", firm="Bastion Capital", is_management="No"), + Speaker(name="Ravi Sundaram", firm="Sundaram Family", is_management="No"), + Speaker(name="Parth Agarwal", firm='individual investor', is_management="No"), + Speaker(name="Vidya Verma", firm='individual investor', is_management="No"), + Speaker(name="Nitin Gandhi", firm="KIFS Trade Capital", is_management="No"), + Speaker(name="Milan Shah", firm="Urmil Research Consultancy", is_management="No"), ] self.maxDiff = None self.assertEqual(speakers, expected) @@ -26,21 +27,20 @@ def test_extract_speakers_gpil(self): pdf = "test_files/gpil-concall.pdf" speakers = get_speakers(pdf) expected = [ - Speaker(name="Moderator", firm=None), - Speaker(name="Ankit Toshniwal", firm="Go India Advisors"), - Speaker(name="Dinesh Gandhi", firm="GPIL"), - Speaker(name="Niteen Dharmavat", firm="Aurum Capital"), - Speaker(name="Abhishek Agrawal", firm="Executive 
Director"), - Speaker(name="Vikas Singh", firm="Philip Capital"), - Speaker(name="Yogansh Jeswani", firm="Mittall Analytics"), - Speaker(name="AM Lodha", firm="Sanmati Consultants"), - Speaker(name="Ayush Mittal", firm="MAPL Value Investing Fund"), - Speaker(name="Pritesh Chheda", firm="Lucky investment Managers"), - Speaker(name="Anurag Patil", firm="Roha Asset Managers"), - Speaker(name="Parthiv Shah", firm="Tracom Stock Brokers"), - Speaker(name="Sanjay Bothra", firm=None), - # A and R International isn't parsed currently - Speaker(name="Utsav Chhawchharia", firm="A and R International"), + Speaker(name="Moderator", firm=None, is_management="No"), + Speaker(name="Ankit Toshniwal", firm="Go India Advisors", is_management="No"), + Speaker(name="Dinesh Gandhi", firm="Director", is_management="Yes"), + Speaker(name="Niteen Dharmavat", firm="Aurum Capital", is_management="No"), + Speaker(name="Abhishek Agrawal", firm="Executive Director", is_management="Yes"), + Speaker(name="Vikas Singh", firm="Philip Capital", is_management="No"), + Speaker(name="Yogansh Jeswani", firm="Mittall Analytics", is_management="No"), + Speaker(name="AM Lodha", firm="Sanmati Consultants", is_management="No"), + Speaker(name="Ayush Mittal", firm="MAPL Value Investing Fund", is_management="No"), + Speaker(name="Pritesh Chheda", firm="Lucky investment Managers", is_management="No"), + Speaker(name="Anurag Patil", firm="Roha Asset Managers", is_management="No"), + Speaker(name="Parthiv Shah", firm="Tracom Stock Brokers", is_management="No"), + Speaker(name="Sanjay Bothra", firm='CFO', is_management="Yes"), + Speaker(name="Utsav Chhawchharia", firm="A and R International", is_management="No"), ] self.maxDiff = None self.assertEqual(speakers, expected) @@ -49,11 +49,12 @@ def test_extract_speakers_tata_motors(self): pdf = "test_files/tata-motor.pdf" speakers = get_speakers(pdf) expected = [ - Speaker(name="PB Balaji", firm="Group CFO"), - Speaker(name="Adrian Mardell", firm="CFO"), - 
Speaker(name="Girish Wagh", firm="Executive Director"), - Speaker(name="Shailesh Chandra", firm="MD"), - Speaker(name="Thierry Bolloré", firm="CEO Jaguar Land Rover"), + #Rearranged Order + Speaker(name="PB Balaji", firm="GROUP CFO", is_management="Yes"), + Speaker(name="Girish Wagh", firm="Executive Director", is_management="Yes"), + Speaker(name="Shailesh Chandra", firm="MD", is_management="Yes"), + Speaker(name="Adrian Mardell", firm="CFO", is_management="Yes"), + Speaker(name="Thierry Bolloré", firm="CEO Jaguar Land Rover", is_management="Yes"), ] self.maxDiff = None self.assertEqual(speakers, expected) @@ -62,20 +63,21 @@ def test_extract_speakers_lt(self): pdf = "test_files/lt.pdf" speakers = get_speakers(pdf) expected = [ - Speaker(name="Moderator", firm=None), - Speaker(name="P. Ramakrishnan", firm=None), - Speaker(name="Renu Baid", firm="IIFL"), - Speaker(name="Sumit Kishore", firm="Axis Capital"), + Speaker(name="Moderator", firm=None, is_management="No"), + #P. Ramakrishnan is actually 'Head,', 'Investor', 'Relations,', 'Larsen', '&', 'Toubro', 'Limited.' + Speaker(name="P. 
Ramakrishnan", firm="Head", is_management="Yes"), + Speaker(name="Renu Baid", firm="IIFL", is_management="No"), + Speaker(name="Sumit Kishore", firm="Axis Capital", is_management="No"), # Mohit Kumar is from DAM Capital # but it is missed as FROM is written as FORM in pdf - Speaker(name="Mohit Kumar", firm=None), - Speaker(name="Ankur Sharma", firm="HDFC Life"), - Speaker(name="Puneet Gulati", firm="HSBC"), - Speaker(name="Nitin Arora", firm="Axis Mutual Fund"), - Speaker(name="Parikshit Khandpal", firm="HDFC Securities"), - Speaker(name="Amish Shah", firm="Bank of America Securities"), - Speaker(name="Ashish Shah", firm="Centrum Broking"), - Speaker(name="Kirti Jain", firm="Canara HSBC Life"), + Speaker(name="Mohit Kumar", firm="DAM Capital", is_management="No"), + Speaker(name="Ankur Sharma", firm="HDFC Life", is_management="No"), + Speaker(name="Puneet Gulati", firm="HSBC", is_management="No"), + Speaker(name="Nitin Arora", firm="Axis Mutual Fund", is_management="No"), + Speaker(name="Parikshit Khandpal", firm="HDFC Securities", is_management="No"), + Speaker(name="Amish Shah", firm="Bank of America Securities", is_management="No"), + Speaker(name="Ashish Shah", firm="Centrum Broking", is_management="No"), + Speaker(name="Kirti Jain", firm="Canara HSBC Life", is_management="No"), ] self.maxDiff = None self.assertEqual(speakers, expected) @@ -84,18 +86,18 @@ def test_extract_speakers_avanti(self): pdf = "test_files/avanti.pdf" speakers = get_speakers(pdf) expected = [ - Speaker(name="C. Ramachandra Rao", firm="Joint Managing Director"), - Speaker(name="Moderator", firm=None), - Speaker(name="Nitin Awasti", firm="Incread research"), - Speaker(name="Sri C Ramachandra Rao", firm=None), - Speaker(name="Sri Muthyam Reddy", firm=None), + Speaker(name="Moderator", firm=None, is_management="No"), + Speaker(name="C. 
Ramachandra Rao", firm="Joint Managing Director", is_management="Yes"), + Speaker(name="Nitin Awasti", firm="Incread research", is_management="No"), + Speaker(name="Sri C Ramachandra Rao", firm=None, is_management="Yes"), + Speaker(name="Sri Muthyam Reddy", firm=None, is_management="No"), # Onkar Ghugadare is from Sree Investment # But it is missed because of spelling error in Ghugadre - Speaker(name="Onkar Ghugadare", firm=None), - Speaker(name="Sri. Alluri Nikhilesh", firm=None), - Speaker(name="Vinayak Mohta", firm="Stallion Asset"), - Speaker(name="Depesh Kashyap", firm="Equirus Capital"), - Speaker(name="Ayush Mittal", firm="Mittal analytics"), + Speaker(name="Onkar Ghugadare", firm='Sree investment' , is_management="No"), + Speaker(name="Sri. Alluri Nikhilesh", firm='Executive Director - Avanti Frozen Foods Pvt Ltd', is_management="Yes"), + Speaker(name="Vinayak Mohta", firm="Stallion Asset", is_management="No"), + Speaker(name="Depesh Kashyap", firm="Equirus Capital", is_management="No"), + Speaker(name="Ayush Mittal", firm="Mittal analytics", is_management="No"), ] self.maxDiff = None self.assertEqual(speakers, expected) @@ -104,15 +106,15 @@ def test_extract_speakers_hdfc(self): pdf = "test_files/hdfc-concall.pdf" speakers = get_speakers(pdf) expected = [ - Speaker(name="Moderator", firm=None), - Speaker(name="Srinivasan V", firm=None), - Speaker(name="Mahrukh Adajania", firm="Edelweiss"), - Speaker(name="Rahul Jain", firm="Goldman Sachs"), - Speaker(name="Aditya Jain", firm="Citigroup"), - Speaker(name="Manish Shukla", firm="Axis Capital"), - Speaker(name="Sagar Doshi", firm=None), - Speaker(name="Adarsh Parasrampuria", firm="CLSA"), - Speaker(name="Saurabh", firm="JP Morgan"), + Speaker(name="Moderator", firm=None, is_management="No"), + Speaker(name="Srinivasan V", firm="Chief Financial Officer", is_management="Yes"), + Speaker(name="Mahrukh Adajania", firm="Edelweiss", is_management="No"), + Speaker(name="Rahul Jain", firm="Goldman Sachs", 
is_management="No"), + Speaker(name="Aditya Jain", firm="Citigroup", is_management="No"), + Speaker(name="Manish Shukla", firm="Axis Capital", is_management="No"), + Speaker(name="Sagar Doshi", firm="Individual Investor", is_management="No"), + Speaker(name="Adarsh Parasrampuria", firm="CLSA", is_management="No"), + Speaker(name="Saurabh", firm="JP Morgan", is_management="No"), ] self.maxDiff = None self.assertEqual(speakers, expected) @@ -121,19 +123,19 @@ def test_extract_speakers_asian_paints(self): pdf = "test_files/asian-paints.pdf" speakers = get_speakers(pdf) expected = [ - Speaker(name="Moderator", firm=None), - Speaker(name="Arun Nair", firm="Corporate Communications"), - Speaker(name="Amit Syngle", firm=None), - Speaker(name="Avi Mehta", firm="Macquarie"), - Speaker(name="Abneesh Roy", firm="Edelweiss"), - Speaker(name="Shirish Pardeshi", firm="Centrum"), - Speaker(name="Parag Rane", firm="GM-Finance May"), - Speaker(name="Saumil Mehta", firm="Kotak Life"), - Speaker(name="Richard Liu", firm="JM Financial"), - Speaker(name="Varun Singh", firm="IDBI Capital"), - Speaker(name="Amit Sachdeva", firm="HSBC Securities"), - Speaker(name="Percy Panthaki", firm="IIFL Securities"), - Speaker(name="Sujay Kamath", firm="Millenium Partners"), + Speaker(name="Moderator", firm=None, is_management='No'), + Speaker(name="Arun Nair", firm="Corporate Communications", is_management="Yes"), + Speaker(name="Amit Syngle", firm="MD & CEO", is_management="Yes"), + Speaker(name="Avi Mehta", firm="Macquarie", is_management="No",), + Speaker(name="Abneesh Roy", firm="Edelweiss", is_management="No"), + Speaker(name="Shirish Pardeshi", firm="Centrum", is_management="No"), + Speaker(name="Parag Rane", firm="GM-Finance", is_management="Yes"), + Speaker(name="Saumil Mehta", firm="Kotak Life", is_management="No"), + Speaker(name="Richard Liu", firm="JM Financial", is_management="No"), + Speaker(name="Varun Singh", firm="IDBI Capital", is_management="No"), + Speaker(name="Amit Sachdeva", 
firm="HSBC Securities", is_management="No"), + Speaker(name="Percy Panthaki", firm="IIFL Securities", is_management="No"), + Speaker(name="Sujay Kamath", firm="Millenium Partners", is_management="No"), ] self.maxDiff = None self.assertEqual(speakers, expected) @@ -142,33 +144,133 @@ def test_extract_speakers_sandur(self): pdf = "test_files/sandur-concall.pdf" speakers = get_speakers(pdf) expected = [ - Speaker(name="Moderator", firm=None), - Speaker(name="Bahirji Ghorpade", firm=None), - Speaker(name="Ayush Agarwal", firm="Mittal Analytics"), - Speaker(name="Shubham Agarwal", firm="Equitas Investments"), - Speaker(name="Abhay Lodha", firm=None), - Speaker(name="Abhishek Maheshwari", firm=None), - Speaker(name="Rahul Jain", firm=None), - Speaker(name="Kamal Gupta", firm=None), - Speaker(name="Ramesh Kumar Jain", firm=None), - Speaker(name="Ayush Mittal", firm="Mittal Analytics"), - Speaker(name="Ashok Kumar", firm=None), - Speaker(name="Yachna Bhatia", firm=None), - Speaker(name="Sahil Sanghvi", firm="Monarch Networth Capital"), - Speaker(name="Mayur Shah", firm=None), - Speaker(name="Rajesh Agarwal", firm=None), - Speaker(name="Abdul Saleem", firm=None), + Speaker(name="Moderator", firm=None, is_management='No'), + Speaker(name="Bahirji Ghorpade", firm="Managing Director", is_management="Yes"), + Speaker(name="Ayush Agarwal", firm="Mittal Analytics", is_management="No"), + Speaker(name="Shubham Agarwal", firm="Equitas Investments", is_management="No"), + Speaker(name="Abhay Lodha", firm=None, is_management="No"), + Speaker(name="Abhishek Maheshwari", firm=None, is_management="No"), + Speaker(name="Rahul Jain", firm="Systematix", is_management="No"), + Speaker(name="Kamal Gupta", firm=None, is_management="No"), + #Ramesh kumar Jain is "Chartered accountant here from Banglore" which is not identified as a firm + Speaker(name="Ramesh Kumar Jain", firm=None, is_management="No"), + Speaker(name="Ayush Mittal", firm="Mittal Analytics", is_management="No"), + 
Speaker(name="Ashok Kumar", firm=None, is_management="No"), + Speaker(name="Yachna Bhatia", firm=None, is_management="No"), + Speaker(name="Sahil Sanghvi", firm="Monarch Networth Capital", is_management="No"), + Speaker(name="Mayur Shah", firm='Anand Rathi Portfolio Management team', is_management="No"), + Speaker(name="Rajesh Agarwal", firm=None, is_management="No"), + Speaker(name="Abdul Saleem", firm='Director Mines', is_management="Yes"), # sachin sanu is not found as his appearance is only once - # Speaker(name="Sachin Sanu", firm=None), - Speaker(name="Manoj Dua", firm=None), - Speaker(name="Bach Raj Nahar", firm=None), - Speaker(name="Arpit Ranka", firm=None), - Speaker(name="Jitendra Anchalia", firm=None), - Speaker(name="Sanjay Jain", firm=None), - Speaker(name="Hardik Jain", firm=None), - Speaker(name="Prashanth Shah", firm=None), - Speaker(name="Participant", firm=None), - Speaker(name="Satish Kumar", firm=None), + Speaker(name="Sachin Sanu", firm='Chief Financial Officer', is_management="Yes"), + Speaker(name="Manoj Dua", firm=None, is_management="No"), + Speaker(name="Bach Raj Nahar", firm=None, is_management="No"), + #because Abdul is spelt incorreting to Abbdul somewhere + #Speaker(name='Abbdul Saleem', firm='Director Mines.', is_management="Yes"), + Speaker(name="Arpit Ranka", firm='Investments', is_management="No"), + Speaker(name="Jitendra Anchalia", firm=None, is_management="No"), + Speaker(name="Sanjay Jain", firm=None, is_management="No"), + Speaker(name="Hardik Jain", firm=None, is_management="No"), + Speaker(name="Prashanth Shah", firm=None, is_management="No"), + Speaker(name="Participant", firm=None, is_management="No"), + Speaker(name="Satish Kumar", firm=None, is_management="No"), + ] + self.maxDiff = None + self.assertEqual(speakers, expected) + + def test_extract_speakers_hind_zinc(self): + pdf = "test_files/hind-zinc.pdf" + speakers = get_speakers(pdf) + expected = [ + # Hindustan is a false positive + #Speaker(name="Hindustan", 
firm=None), + Speaker(name="Moderator", firm=None, is_management="No"), + Speaker(name="Shweta Arora", firm='Head of Investor Relations', is_management="Yes"), + # the following designation is wrong because the person's designation is mentioned before the name + Speaker(name="Arun Misra", firm='Interim CFO', is_management="Yes"), + Speaker(name="Sandeep Modi", firm=None, is_management="Yes"), + Speaker(name="Amit Dixit", firm="Edelweiss", is_management="No"), + Speaker(name="Anuj Singla", firm="Bank of America", is_management="No"), + Speaker(name="Abhiram Iyer", firm="Deutsche CIB Center", is_management="No"), + Speaker( + name="Vishal Chandak", firm="Motilal Oswal Financial Services", is_management="No" + ), + # Vishal Chandak is written as Visha Chandak (the l is not in bold) + #Speaker(name="Visha", firm=None), + Speaker(name="Vikash Singh", firm="Phillip Capital", is_management="No"), + Speaker(name="Ritesh Shah", firm="Investec", is_management="No"), + Speaker(name="Abhijit Mitra", firm="ICICI Securities", is_management="No"), + Speaker( + name="Pallav Agarwal", firm="Antique Stock Broking Limited", is_management="No" + ), + Speaker(name="Rahul Jain", firm="Systematix", is_management="No"), + Speaker(name="Saket Reddy", firm="Polsani Enterprises", is_management="No"), + ] + self.maxDiff = None + self.assertEqual(speakers, expected) + + def test_extract_speakers_eng_india(self): + pdf = "test_files/eng-india.pdf" + speakers = get_speakers(pdf) + expected = [ + Speaker(name="Moderator", firm=None, is_management="No"), + Speaker( + name="Kunal Sheth", + firm="Batlivala &Karani Securities India Private Limited", is_management="No" + ), + Speaker(name="Vartika Shukla", firm='Chairman', is_management="Yes"), + Speaker(name="Vinay Kalia", firm="Chief General Manager", is_management="Yes"), + Speaker(name="Saket Kapoor", firm="Kapoor Industries", is_management="No"), + Speaker(name="R.P.
Batra", firm='Group General Manager', is_management="Yes"), + Speaker(name="Viral Shah", firm="YES Securities", is_management="No"), + Speaker( + name="Sagar Gandhi", + firm="Future Generali India Life Insurance Company", is_management="No" + ), + ] + self.maxDiff = None + self.assertEqual(speakers, expected) + + def test_extract_speakers_gsfc(self): + pdf = "test_files/gsfc.pdf" + speakers = get_speakers(pdf) + expected = [ + Speaker(name="V. D. Nanavaty", firm=None, is_management="Yes"), + Speaker(name="Moderator", firm=None, is_management="No"), + Speaker(name="Nirav Jimudia", firm="Anvil Research", is_management="No"), + Speaker(name="Bharath Subramanian", firm="Sundaram Mutual Funds", is_management="No"), + Speaker(name="Ahmed Madha", firm="Unifi Capital", is_management="No"), + Speaker(name="Poojan Patel", firm="Mahadev Capital", is_management="No"), + Speaker(name="Nishith Shah", firm="Equitus Investments", is_management="No"), + Speaker(name="Saket Kapoor", firm="Kapoor & Company", is_management="No"), + Speaker(name="S.P. Yadav", firm='Deputy Director', is_management="Yes"), + Speaker(name="Deepak Chitroda", firm="Phillip Capital", is_management="No"), + Speaker(name="Sreemant Dudhoria", firm="Unifi Capital", is_management="No"), + Speaker(name="Priya Mehta", firm="Rishi Finstock", is_management="No"), + Speaker(name="Falguni Dutta", firm="Jet Age Securities", is_management="No"), + ] + self.maxDiff = None + self.assertEqual(speakers, expected) + + def test_extract_speakers_tcs(self): + pdf = "test_files/tcs.pdf" + speakers = get_speakers(pdf) + expected = [ + Speaker(name="Moderator", firm=None, is_management="No"), + Speaker(name="Kedar Shirali", firm="Global Head", is_management="Yes"), + Speaker(name="Rajesh Gopinathan", firm='Chief Executive Officer', is_management="Yes"), + Speaker(name="Samir Seksaria", firm='Chief Financial Officer', is_management="Yes"), + Speaker(name="N.G. 
Subramaniam", firm='Chief Operating Officer', is_management="Yes"), + Speaker(name="Kumar Rakesh", firm="BNP Paribas", is_management="No"), + Speaker(name="Diviya Nagarajan", firm="UBS", is_management="No"), + Speaker(name="Sandip Agarwal", firm="Edelweiss", is_management="No"), + Speaker(name="Apurva Prasad", firm="HDFC Securities", is_management="No"), + Speaker( + name="Mukul Garg", firm="Motilal Oswal Financial Services", is_management="No" + ), + Speaker(name="Sandeep Shah", firm="Equirus Securities", is_management="No"), + Speaker(name="Ravi Menon", firm="Macquarie", is_management="No"), + Speaker(name="Gaurav Rateria", firm="Morgan Stanley", is_management="No"), ] self.maxDiff = None self.assertEqual(speakers, expected)