-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtextual_data_keyword_extractor.py
More file actions
151 lines (135 loc) · 7.8 KB
/
Copy pathtextual_data_keyword_extractor.py
File metadata and controls
151 lines (135 loc) · 7.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import csv
import os
import sys
import time
import datetime
import re
def print_tdke():
    """Print the TDKE ASCII-art banner followed by the tool name.

    The art is stored in a raw string literal: it is full of backslashes,
    and in a normal string a backslash at the end of a line would act as a
    line continuation (joining art rows together) while sequences such as
    ``\\:`` or ``\\/`` are invalid escapes that newer Python versions warn
    about. A raw string prints every character exactly as written.
    """
    tdke_art = r"""
_____ _____ _____ _____
/\ \ /\ \ /\ \ /\ \
/::\ \ /::\ \ /::\____\ /::\ \
\:::\ \ /::::\ \ /:::/ / /::::\ \
\:::\ \ /::::::\ \ /:::/ / /::::::\ \
\:::\ \ /:::/\:::\ \ /:::/ / /:::/\:::\ \
\:::\ \ /:::/ \:::\ \ /:::/____/ /:::/__\:::\ \
/::::\ \ /:::/ \:::\ \ /::::\ \ /::::\ \:::\ \
/::::::\ \ /:::/ / \:::\ \ /::::::\____\________ /::::::\ \:::\ \
/:::/\:::\ \ /:::/ / \:::\ ___\ /:::/\:::::::::::\ \ /:::/\:::\ \:::\ \
/:::/ \:::\____\/:::/____/ \:::| |/:::/ |:::::::::::\____\/:::/__\:::\ \:::\____|
/:::/ \::/ /\:::\ \ /:::|____|\::/ |::|~~~|~~~~~ \:::\ \:::\ \::/ /
/:::/ / \/____/ \:::\ \ /:::/ / \/____|::| | \:::\ \:::\ \/____/
/:::/ / \:::\ \ /:::/ / |::| | \:::\ \:::\ \
/:::/ / \:::\ /:::/ / |::| | \:::\ \:::\____\
\::/ / \:::\ /:::/ / |::| | \:::\ \::/ /
\/____/ \:::\/:::/ / |::| | \:::\ \/____/
\::::::/ / |::| | \:::\ \
\::::/ / \::| | \:::\____\
\::/____/ \:| | \::/ /
~~ \|___| \/____/
Textual Data Keyword Extractor
"""
    print(tdke_art)
def get_matched_words_nobound(record, keywords, text_column_name):
    """Return the keywords that occur anywhere in the record's text.

    Deprecated: matching uses a plain substring test, so it is
    case-insensitive but ignores word boundaries (the keyword "sore" also
    matches "sorely").

    Args:
        record: Mapping for one row; ``record[text_column_name]`` holds the
            text to search.
        keywords: Iterable of keyword strings to look for.
        text_column_name: Key of the text column in ``record``.

    Returns:
        The matched keywords, in the order given, joined by ``', '``; an
        empty string when nothing matches.

    Raises:
        KeyError: If ``text_column_name`` is not a key of ``record``.
    """
    # csv.DictReader fills fields missing from a short row with None; treat
    # that as empty text instead of crashing on None.lower().
    text = (record[text_column_name] or '').lower()
    matched_words = [keyword for keyword in keywords if keyword.lower() in text]
    return ', '.join(matched_words)
def get_matched_words(record, keywords, text_column_name):
    """Return the keywords that occur as whole words/phrases in the record's text.

    Matching is case-insensitive (both sides are lowercased) and anchored
    with regex word boundaries, so "sore" matches "a sore." but not
    "sores" or "sorely".

    Args:
        record: Mapping for one row; ``record[text_column_name]`` holds the
            text to search.
        keywords: Iterable of keyword strings (single words or phrases).
        text_column_name: Key of the text column in ``record``.

    Returns:
        The matched keywords, in the order given, joined by ``', '``; an
        empty string when nothing matches.

    Raises:
        KeyError: If ``text_column_name`` is not a key of ``record``.
    """
    # csv.DictReader fills fields missing from a short row with None; treat
    # that as empty text instead of crashing on None.lower().
    text = (record[text_column_name] or '').lower()
    matched_words = [
        keyword
        for keyword in keywords
        # re.escape keeps any regex metacharacters in a keyword literal.
        if re.search(r'\b' + re.escape(keyword.lower()) + r'\b', text)
    ]
    return ', '.join(matched_words)
def _write_tdke_log(input_csv_file, output_csv_file, text_column_name,
                    rows_label, duration, output_path,
                    log_file_path='./log/log.txt'):
    """Append one run summary to the TDKE log file, creating its directory if needed."""
    duration_min = duration / 60
    duration_hrs = duration_min / 60
    dt_string = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")

    # Only claim an output table exists when one was actually written.
    if output_path is None:
        outcome = "No table output was written (no matching records).\n"
    else:
        outcome = f"Table output has been saved to '{output_path}'.\n"

    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)  # open(..., 'a') does not create directories
    with open(log_file_path, 'a', encoding='utf-8') as log_file:
        log_file.writelines([
            "Script execution details:\n",
            "*** TDKE Script Log item ***\n",
            f"Date and time: {dt_string}\n",
            f"CSV file path: {input_csv_file}\n",
            f"Text column name: {text_column_name}\n",
            f"Output prefix: {output_csv_file}\n",
            f"Number of rows processed: {rows_label}\n",
            f"Processing time (sec): {duration:.2f} seconds.\n",
            f"Processing time (min): {duration_min:.2f} minutes.\n",
            f"Processing time (hrs): {duration_hrs:.2f} hours.\n",
            outcome,
            "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n",
        ])


def extract_records(input_csv_file, output_csv_file, keywords, limit_rows, text_column_name,
                    max_rows=100):
    """Copy the CSV rows whose text contains at least one keyword to a new CSV.

    Each kept row gains a ``matched_words`` column listing the keywords found.
    The result is written to ``./output/tdke/<output_csv_file>`` (only when at
    least one row matched) and a run summary is appended to ``./log/log.txt``.

    Args:
        input_csv_file: Path of the UTF-8 CSV file to read (with header row).
        output_csv_file: File name of the CSV to create inside ``./output/tdke/``.
        keywords: Keywords passed to ``get_matched_words`` for every row.
        limit_rows: ``'-y'`` to read at most ``max_rows`` rows; any other
            value reads the whole file.
        text_column_name: Name of the column holding the text to search.
        max_rows: Row cap applied when ``limit_rows == '-y'`` (default 100).

    Returns:
        None.

    Raises:
        OSError: If the input cannot be read or the output/log cannot be written.
        KeyError: If ``text_column_name`` is not a column of the input file.
    """
    # Time the run locally so the function does not depend on a module-level
    # ``start_time`` that only exists when the file is run as a script.
    started_at = time.time()
    limited = limit_rows == '-y'

    extracted_records = []
    rows_processed = 0
    # newline='' lets the csv module handle newlines embedded in quoted fields.
    with open(input_csv_file, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            if limited and rows_processed >= max_rows:
                break
            rows_processed += 1
            matched_words = get_matched_words(row, keywords, text_column_name)
            if matched_words:  # Only keep the record if there are matched words
                row['matched_words'] = matched_words
                extracted_records.append(row)

    output_path = None
    if extracted_records:
        output_dir = './output/tdke/'
        os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist
        output_path = os.path.join(output_dir, output_csv_file)
        headers = list(extracted_records[0].keys())
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()
            writer.writerows(extracted_records)
        print(f"Extracted records containing keywords {keywords} saved to {output_csv_file}")
    else:
        print("No records containing the specified keywords found.")

    duration = time.time() - started_at
    _write_tdke_log(
        input_csv_file,
        output_csv_file,
        text_column_name,
        # Report the real count when limited; the unlimited case keeps the
        # established 'all' wording of the log format.
        rows_processed if limited else 'all',
        duration,
        output_path,
    )
if __name__ == "__main__":
    # Script entry point: show the banner, check the command line, then run
    # the extraction with the built-in keyword list.
    print_tdke()

    if len(sys.argv) != 5:
        print("Usage: python your_script.py <csv_file_path> <text_column_name> <output_prefix> <limit_rows:-y/-n>")
        sys.exit(1)

    # Module-level timestamp marking the start of the run.
    start_time = time.time()

    _script_name, csv_file_path, text_column_name, output_prefix, limit_rows = sys.argv

    # Suffix the output name with the launch time so repeated runs never
    # overwrite each other's results.
    run_stamp = datetime.datetime.now().strftime("_%Y%m%d%H%M%S")
    input_csv_file = csv_file_path
    output_csv_file = f'{output_prefix}{run_stamp}_extracted_records.csv'

    keywords = [
        # falls
        'patient fall', 'patient fell', 'falls', 'falling',
        'fell from bed', 'fall from bed',
        # pressure injuries
        'sore', 'sores', 'pressure injuries', 'pressure sores',
        'bedsore', 'bedsores', 'bed sore', 'bed sores',
        # absconding
        'abscond', 'absconded', 'absconding', 'abscondment', 'abscondments',
    ]

    extract_records(input_csv_file, output_csv_file, keywords, limit_rows, text_column_name)