diff --git a/.gitignore b/.gitignore index f5fe7e1..a78c4c9 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,4 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ -datasets \ No newline at end of file +data/* \ No newline at end of file diff --git a/README.md b/README.md index 5e04947..bd6dc5e 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,51 @@ # 🐟 PhishNet -![PhishNet Art](https://github.com/sirlolcat/PhishNet/assets/85698684/1cc320ce-3878-4cb0-b2fd-c6c757dd82af) +![PhishNet Art](/assets/phishnet-art-base.png) -**DISCLAIMER**: *The content provided by **PhishNet** is exclusively for educational and research purposes **ONLY**. The training data for our GPT-2 derived model has been carefully cleaned to remove any private or personally identifiable information (**PII**) to ensure ethical compliance and privacy. The views and opinions expressed are solely those of the authors and do not reflect any associated organizations. No warranty is provided regarding the accuracy or reliability of the information. Usage of **PhishNet** and its outputs is at your own risk, with no liability for any resultant damages. This project does not endorse illegal activities and should be used responsibly.* +**DISCLAIMER**: *The content provided by **PhishNet** is exclusively for educational and research purposes **ONLY**. The training data for the models under the **PhishNet** umbrella has been carefully cleaned to remove any **sensitive** or **personally identifiable information (PII)** to ensure ethical compliance and privacy. The views and opinions expressed are solely those of the authors and do not reflect any associated organizations. No warranty is provided regarding the accuracy or reliability of the information. Usage of **PhishNet** and its outputs is at your own risk, with no liability for any resultant damages. 
This project does not endorse illegal activities and should be used responsibly.* ## TL;DR -**PhishNet** is a research project utilizing **Reinforced Self-Training (ReST)** and fine-tuned **GPT-2** to create a high-quality synthetic dataset of phishing emails. Trained on various valuable email datasets (see citations), this project aims to dive into the exploration of adversarial AI and expand our understanding of AI safety. + +**PhishNet** is a research project utilizing **Reinforced Self-Training (ReST)** and a fine-tuned set of **Large Language Models (LLMs)** to create a high-quality synthetic dataset of phishing emails. Trained on various valuable email datasets (see [citations](#citations)), this project aims to dive into the exploration of adversarial AI and expand our understanding of AI safety. + +## Table of Contents + +- [Disclaimer](#disclaimer) +- [TL;DR](#tldr) +- [Methodology](#methodology) + - [Data Collection](#data-collection) + - [Model Training](#model-training) + - [Data Generation](#data-generation) + +- [Local Installation & Usage](#local-installation--usage) +- [Results and Evaluation](#results-and-evaluation) +- [Citations](#citations) +- [License](#license) + +## Methodology + +- **Data Collection**: Various datasets such as the Enron Email Dataset, the Phishing Email Detection dataset, etc. +- **Model Training**: Using various LLMs with Reinforced Self-Training. +- **Data Generation**: Synthesizing phishing emails for research. + +## Local Installation & Usage + +If you prefer to run PhishNet locally: + +1. Clone the repository. +2. Install dependencies: `pip install -r requirements.txt`. +3. Run the model with your input data. + +## Results and Evaluation + +This section is under development and will be updated soon. ## Citations + - Radford, A., Wu, J., Child, R., et al. (2019). Language Models are Unsupervised Multitask Learners. 
[Link](https://github.com/openai/gpt-2) + ``` @article{radford2019language, title={Language Models are Unsupervised Multitask Learners}, @@ -16,7 +53,9 @@ year={2019} } ``` + - Gulcehre, C., Le Paine, T., Srinivasan, S., et al. (2023). Reinforced Self-Training (ReST) for Language Modeling. arXiv preprint arXiv:2308.08998. [Link](https://arxiv.org/abs/2308.08998) + ``` @misc{gulcehre2023reinforced, title={Reinforced Self-Training (ReST) for Language Modeling}, @@ -27,22 +66,12 @@ primaryClass={cs.CL} } ``` -- The Enron Email Dataset. Carnegie Mellon University. [Link](https://www.cs.cmu.edu/~enron/). + - The Enron Email Dataset. Kaggle. [Link](https://www.kaggle.com/datasets/wcukierski/enron-email-dataset) -- Fraudulent Email Corpus. Kaggle. [Link](https://www.kaggle.com/datasets/rtatman/fraudulent-email-corpus) -- Spam Mails Database. Kaggle. [Link](https://www.kaggle.com/datasets/venky73/spam-mails-dataset) - Phishing Email Detection. Kaggle. [Link](https://www.kaggle.com/datasets/subhajournal/phishingemails) - Customer Support Ticket Dataset. Kaggle [Link](https://www.kaggle.com/datasets/suraj520/customer-support-ticket-dataset) - Spam or Not Spam Dataset. Kaggle [Link](https://www.kaggle.com/datasets/ozlerhakan/spam-or-not-spam-dataset) -## Table Of Content -- [Getting Started](#getting-started) - - [Installation](#installation) - - [Usage](#usage) -- [Methodology](#methodology) - - [Data Collection](#data-collection) - - [Model Training](#model-training) - - [Data Generation](#data-generation) -- [Results and Evaluation](#results-and-evaluation) -- [Contributing](#contributing) -- [License](#license) +## License + +**PhishNet** is released under the **MIT** License. The full text of the license can be found via [LICENSE.md](LICENSE). 
diff --git a/assets/README.md b/assets/README.md new file mode 100644 index 0000000..7038911 --- /dev/null +++ b/assets/README.md @@ -0,0 +1,45 @@ +# 🏞️ PhishNet Assets + +![PhishNet Art](/assets/phishnet-art-assets.png) + +## Overview + +This directory holds a collection of artwork; each piece is a visual representation of what **PhishNet** stands for. These pieces are not just for show; they are carefully designed to reflect the core ideas and goals of our project. As you explore the **PhishNet** documentation and interfaces, these images serve as a quick visual guide to our concept. The images here, referred to as *"PhishNet Art"*, are the result of generative art created by [*DALL·E 3*](https://openai.com/dall-e-3). + +## Artwork Description + +Each piece visually translates this project's principles to make the documentation more accessible and engaging. Think of these images as a prelude to our research docs, offering an easy-to-grasp visual introduction before diving into the detailed concepts. + +### Style and Inspiration + +The artwork is inspired by retro computer software aesthetics, specifically reminiscent of the classic Windows 98 graphics. The design is intentionally pixelated, providing a vintage look that evokes a sense of 1990s tech nostalgia. + +### Theme and Symbolism + +Central to each image is a fish figure, creatively used to symbolize the concept of 'phishing' in the realm of cybersecurity. The fish figure is not just a literal representation of the term 'phishing' but also serves as a playful yet relevant element that ties back to the core theme of the project. + +### Artwork Creation Prompt + +The following prompt was used as the foundation of the *PhishNet Art*: + +``` +Create a retro computer software style logo for 'PhishNet', featuring a fish figure. The design should be reminiscent of classic Windows 98 graphics, with a vintage, pixelated look. 
The logo should be playful yet relevant to the concept of phishing in cybersecurity, incorporating the fish figure creatively to symbolize the 'phish' in PhishNet. The overall style should evoke a sense of 1990s tech nostalgia. +``` + +## Usage + +The images in this folder are used across various parts of the PhishNet project, including: + +- Main README.md +- PII Removal / Privacy Compliance module +- Privacy Enhancement Analysis documentation + +Each image tailors the [base prompt](#artwork-creation-prompt) subjectively to fit the context in which it is used while maintaining a consistent thematic and stylistic approach. + +## Contributing + +If you wish to contribute to the *PhishNet Art* collection or suggest modifications, please adhere to the style and thematic guidelines outlined above. + +## License + +All *PhishNet Art* is part of the **PhishNet** project and is subject to the same [MIT License](../LICENSE) as the rest of the project. diff --git a/assets/phishnet-art-analysis.png b/assets/phishnet-art-analysis.png new file mode 100644 index 0000000..c9dbf96 Binary files /dev/null and b/assets/phishnet-art-analysis.png differ diff --git a/assets/phishnet-art-assets.png b/assets/phishnet-art-assets.png new file mode 100644 index 0000000..d32118b Binary files /dev/null and b/assets/phishnet-art-assets.png differ diff --git a/assets/phishnet-art-base.png b/assets/phishnet-art-base.png new file mode 100644 index 0000000..d19c24e Binary files /dev/null and b/assets/phishnet-art-base.png differ diff --git a/assets/phishnet-art-pii-removal.png b/assets/phishnet-art-pii-removal.png new file mode 100644 index 0000000..dc0f953 Binary files /dev/null and b/assets/phishnet-art-pii-removal.png differ diff --git a/docs/pii-removal/README.md b/docs/pii-removal/README.md new file mode 100644 index 0000000..ad17ef7 --- /dev/null +++ b/docs/pii-removal/README.md @@ -0,0 +1,40 @@ +# 🔎 Privacy Enhancement Analysis for PII Removal + +![PhishNet 
Art](/assets/phishnet-art-analysis.png) + +## Overview + +This documentation provides an in-depth analysis of the privacy enhancement results generated by the PII Removal module in the **PhishNet** project. The visualization aids in understanding the effectiveness and the scientific rigor behind the data sanitization process. + +## Table of Contents + +- [Detailed Explanation](#detailed-explanation) + - [Green Bars (Number of Replacements)](#green-bars-number-of-replacements) + - [Blue Line (Improvement Score)](#blue-line-improvement-score) +- [Mathematical Implications](#mathematical-implications) +- [Interpretation of Results](#interpretation-of-results) +- [Conclusion](#conclusion) + +## Detailed Explanation + +Instances of the `_replacements_and_improvement_chart.png` chart demonstrate the PII removal process's effectiveness across the given dataset processed on a **T4** GPU. The green bars and blue line represent the count of PII entities removed and the improvement score, respectively, providing quantitative insights into the privacy enhancement achieved. + +### Green Bars (Number of Replacements) + +The height of each green bar corresponds to the absolute number of PII entities detected and anonymized in that chunk. This serves as an indicator of the tool's effectiveness in identifying and obfuscating sensitive information. + +### Blue Line (Improvement Score) + +The blue line reflects the improvement score, a normalized metric that indicates the extent of PII removed from the dataset. It is a relative measure that standardizes the comparison of privacy enhancement across different chunks, irrespective of their size. + +## Mathematical Implications + +The variation in the metrics reflects the inherent inconsistencies in PII distribution, which is typical of real-world datasets. The improvement score is a testament to the tool's ability to handle diverse data sets, providing a statistically sound measure of its performance. 
+ +## Interpretation of Results + +The results depicted in the chart provide a visual and quantitative analysis of the PII removal module's performance. They are crucial for verifying the module's effectiveness in enhancing privacy and maintaining data integrity. + +## Conclusion + +The privacy enhancement visualization underscores the project's commitment to ethical data practices and its contribution to the field of AI safety and data privacy. It exemplifies the scientific foundation underpinning the **PhishNet** project's approach to PII removal. diff --git a/docs/pii-removal/phishing_mail_replacements_and_improvement_chart.png b/docs/pii-removal/phishing_mail_replacements_and_improvement_chart.png new file mode 100644 index 0000000..a96c8fc Binary files /dev/null and b/docs/pii-removal/phishing_mail_replacements_and_improvement_chart.png differ diff --git a/requirements.txt b/requirements.txt index 70db91d..e7ffc15 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,32 +1,63 @@ accelerate==0.24.1 +annotated-types==0.6.0 bleach==6.1.0 +blis==0.7.11 +catalogue==2.0.10 certifi==2023.11.17 charset-normalizer==3.3.2 +click==8.1.7 +cloudpathlib==0.16.0 +confection==0.1.4 +contourpy==1.2.0 +cycler==0.12.1 +cymem==2.0.8 filelock==3.13.1 +fonttools==4.45.1 fsspec==2023.10.0 huggingface-hub==0.19.4 idna==3.4 Jinja2==3.1.2 kaggle==1.5.16 +kiwisolver==1.4.5 +langcodes==3.3.0 MarkupSafe==2.1.3 +matplotlib==3.8.2 mpmath==1.3.0 +murmurhash==1.0.10 networkx==3.2.1 numpy==1.26.2 packaging==23.2 +pandas==2.1.3 +Pillow==10.1.0 +preshed==3.0.9 psutil==5.9.6 +pydantic==2.5.2 +pydantic_core==2.14.5 +pyparsing==3.1.1 python-dateutil==2.8.2 python-slugify==8.0.1 +pytz==2023.3.post1 PyYAML==6.0.1 regex==2023.10.3 requests==2.31.0 safetensors==0.4.0 six==1.16.0 +smart-open==6.4.0 +spacy==3.7.2 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.4.8 sympy==1.12 text-unidecode==1.3 +thinc==8.2.1 tokenizers==0.15.0 torch==2.1.1 tqdm==4.66.1 transformers==4.35.2 +typer==0.9.0 
typing_extensions==4.8.0 +tzdata==2023.3 urllib3==2.1.0 +wasabi==1.1.2 +weasel==0.3.4 webencodings==0.5.1 diff --git a/scripts/download_base_datasets.py b/scripts/download_base_datasets.py index 9edf539..573f728 100644 --- a/scripts/download_base_datasets.py +++ b/scripts/download_base_datasets.py @@ -1,45 +1,49 @@ """ -This module downloads various datasets from Kaggle and saves them locally. +This script downloads various datasets from Kaggle and saves them in the 'data/raw' directory. """ import os -import requests from tqdm import tqdm from kaggle.api.kaggle_api_extended import KaggleApi -def download_file(url, filename): - """ - Downloads a file from the given URL and saves it to the specified filename. - """ - with requests.get(url, stream=True, timeout=10) as r: # Added timeout - total_length = int(r.headers.get('content-length')) - with open(filename, 'wb') as f: - for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_length//1024, unit='KB', desc=f'Downloading {filename}'): - if chunk: - f.write(chunk) -if not os.path.exists('datasets'): - os.makedirs('datasets') - # Kaggle datasets kaggle_datasets = [ "wcukierski/enron-email-dataset", "rtatman/fraudulent-email-corpus", - "venky73/spam-mails-dataset", "subhajournal/phishingemails", "suraj520/customer-support-ticket-dataset", "ozlerhakan/spam-or-not-spam-dataset", ] -# Initialize Kaggle API -api = KaggleApi() -api.authenticate() +# Function to initialize Kaggle API +def initialize_kaggle_api(): + api_instance = KaggleApi() + api_instance.authenticate() + return api_instance + +# Function to download datasets from Kaggle and extract them +def download_and_extract_dataset(api_instance, dataset_identifier, path='data/raw'): + dataset_key = dataset_identifier.split('/')[-1] + dataset_path = f'{path}/{dataset_key}' -# Download Kaggle datasets -for dataset in kaggle_datasets: - dataset_key = dataset.rsplit('/', maxsplit=1)[-1] - dataset_path = f'datasets/{dataset_key}' if not 
os.path.exists(dataset_path): - print(f'Trying to download {dataset_key}...') - api.dataset_download_files(dataset, path='datasets', unzip=True, quiet=False) + api_instance.dataset_download_files(dataset_identifier, path=path, unzip=True) + tqdm.write(f'Dataset {dataset_key} downloaded and extracted in {path}.') + else: + tqdm.write(f'Dataset {dataset_key} already exists in {path}.') + +if __name__ == "__main__": + # Initialize Kaggle API + api = initialize_kaggle_api() + + # Create 'data/raw' directory if it doesn't exist + if not os.path.exists('data/raw'): + os.makedirs('data/raw') + + # Download Kaggle datasets + with tqdm(total=len(kaggle_datasets), desc='Downloading datasets') as pbar: + for dataset in kaggle_datasets: + download_and_extract_dataset(api, dataset) + pbar.update(1) -print("Datasets downloaded and renamed successfully.") + print("All datasets have been downloaded and extracted successfully.") diff --git a/scripts/merge_datasets.py b/scripts/merge_datasets.py new file mode 100644 index 0000000..55899c2 --- /dev/null +++ b/scripts/merge_datasets.py @@ -0,0 +1,39 @@ +import os +import pandas as pd + +def merge_datasets(raw_data_dir, processed_data_dir, output_filename): + # Define the path for raw and processed data directories + raw_data_path = os.path.join(raw_data_dir) + processed_data_path = os.path.join(processed_data_dir) + + # Ensure the processed data directory exists + if not os.path.exists(processed_data_path): + os.makedirs(processed_data_path) + + # List of dataset filenames in the raw data directory + dataset_filenames = [f for f in os.listdir(raw_data_path) if os.path.isfile(os.path.join(raw_data_path, f))] + + # Initialize an empty DataFrame to append data + combined_df = pd.DataFrame() + + # Read and append all datasets into one DataFrame + for filename in dataset_filenames: + file_path = os.path.join(raw_data_path, filename) + # Read the CSV file and append it + if filename.endswith('.csv'): + df = pd.read_csv(file_path) + 
combined_df = combined_df._append(df, ignore_index=True) + elif filename.endswith('.txt'): # Assuming the .txt file is in a readable format + df = pd.read_csv(file_path, sep='\t') + combined_df = combined_df._append(df, ignore_index=True) + + # Save the combined data to a new CSV file in the processed data directory + combined_df.to_csv(os.path.join(processed_data_path, output_filename), index=False) + print(f"Combined dataset saved to {os.path.join(processed_data_path, output_filename)}") + +if __name__ == "__main__": + RAW_DATA_DIR = 'data/raw' + PROCESSED_DATA_DIR = 'data/processed' + OUTPUT_FILENAME = 'combined_dataset.csv' + + merge_datasets(RAW_DATA_DIR, PROCESSED_DATA_DIR, OUTPUT_FILENAME) diff --git a/src/pii-removal/README.md b/src/pii-removal/README.md new file mode 100644 index 0000000..5878e30 --- /dev/null +++ b/src/pii-removal/README.md @@ -0,0 +1,65 @@ +# 🛡️ PhishNet - PII Removal / Privacy Compliance + +![PhishNet Art](/assets/phishnet-art-pii-removal.png) + +## Table of Contents + +- [Introduction](#introduction) +- [Objective](#objective) +- [How It Works](#how-it-works) +- [Usage](#usage) +- [Technologies Used](#technologies-used) +- [Results and Methodological Approach](#results-and-methodological-approach) + - [Quantitative Metrics](#quantitative-metrics) +- [Contributing](#contributing) +- [Disclaimer](#disclaimer) +- [License](#license) + +## Introduction + +As part of our commitment to ethical AI research and development, this module is dedicated to the meticulous removal of any private, sensitive, or personally identifiable information (PII) from our datasets. Ensuring privacy and compliance is paramount in our quest to create a high-quality synthetic dataset of phishing emails. + +## Objective + +Our primary goal is to pre-process and sanitize the datasets used in **PhishNet** training, effectively stripping out all **PII** and **sensitive** information. 
This process, conducted on **T4** GPUs, is crucial in maintaining the integrity of our research and upholding our ethical standards. We ensure that our model is **NEVER** trained on `raw` datasets, only on data that has been thoroughly sanitized for PII (`processed`). + +## How It Works + +- **Automated Detection**: Utilizing Named-Entity Recognition (**NER**), this tool automatically identifies potential PII in raw datasets under `data/raw`. (PII is defined as: `EMAIL`, `PERSON`, `ORG`, `CARDINAL`, `GPE`, `LOC`) +- **Removal Process**: Once detected, PII is obfuscated or anonymized to ensure no sensitive information is retained. +- **Quality Assurance**: Post-processing checks are conducted to verify the thoroughness of PII removal. + +## Usage + +1. **Preparation**: Ensure that raw datasets are placed in the `data/raw` directory. +2. **Execution**: Run the PII Removal script by executing `python3 pii_removal.py` directly. +3. **Output**: Processed datasets, free from PII, are saved in `data/processed`. + +## Technologies Used + +- **NLP Library**: [spaCy](https://spacy.io/) +- **Programming Language**: [Python 3.10](https://www.python.org/downloads) +- **Regular Expressions**: [re](https://docs.python.org/3/library/re.html) + +## Results and Methodological Approach + +The process of PII removal for each raw dataset is encapsulated respectively in charts named `_replacements_and_improvement_chart.png`, found in the `docs/pii-removal` directory. Instances of this chart illustrate both the quantity of PII entities removed (represented by the green bars) and the relative improvement in privacy (depicted by the blue line) for each chunk of data processed. + +### Quantitative Metrics + +- **Number of Replacements**: Indicates the count of PII entities detected and removed per chunk, providing a direct measure of the tool's activity. +- **Improvement Score**: A normalized measure indicating the proportion of text altered due to PII removal. 
It is calculated as the ratio of the number of characters removed to the total characters in the chunk, providing a standardized metric for comparing privacy enhancement across chunks. + +For a comprehensive explanation of the chart and its significance in the context of privacy enhancement, please refer to the README in the `docs/pii-removal` folder. + +## Contributing + +We appreciate contributions from the community! If you have suggestions or improvements, feel free to fork the repository and submit a pull request. + +## Disclaimer + +*The PII Removal tool is a part of the **PhishNet** project, which is intended strictly for educational and research purposes **ONLY**. While we strive for accuracy, we offer no warranty regarding the completeness of PII removal. Usage is at your own risk.* + +## License + +This PII Removal module, like the rest of the **PhishNet** project, is released under the [MIT License](LICENSE). diff --git a/src/pii-removal/pii_removal.py b/src/pii-removal/pii_removal.py new file mode 100644 index 0000000..380a29f --- /dev/null +++ b/src/pii-removal/pii_removal.py @@ -0,0 +1,114 @@ +import os +import re +import spacy +import pandas as pd +from tqdm.auto import tqdm +import matplotlib.pyplot as plt +from concurrent.futures import ProcessPoolExecutor + +# Constants and configurations +NLP_MODEL = 'en_core_web_md' +PII_LABELS = {'PERSON', 'ORG', 'CARDINAL', 'GPE', 'LOC'} +EMAIL_REGEX = re.compile(r'\b\S+@\S+\.\S+\b') +MAX_LENGTH = 2000000 + +# Initialize spaCy model with configurations +def init_spacy_model(): + nlp = spacy.load(NLP_MODEL, disable=['parser', 'tagger', 'lemmatizer']) + nlp.max_length = MAX_LENGTH + return nlp + +nlp = init_spacy_model() + +# Function to remove PII using regex and NER +def remove_pii(text, nlp): + text = re.sub(EMAIL_REGEX, '[EMAIL]', text) + doc = nlp(text) + replacements_count = 0 + for ent in doc.ents: + if ent.label_ in PII_LABELS: + text = text.replace(ent.text, f'[{ent.label_}]') + replacements_count += 1 
+ return text, replacements_count + +# Process a single row of data +def process_row(row, text_columns, nlp): + row_replacements = 0 + for col in text_columns: + text = row[col] + if isinstance(text, str): + cleaned_text, replacements = remove_pii(text, nlp) + row[col] = cleaned_text + row_replacements += replacements + return row, row_replacements + +# Apply PII removal to a chunk of data +def process_chunk(chunk, text_columns, nlp): + chunk_replacements = 0 + for i, row in chunk.iterrows(): + processed_row, row_replacements = process_row(row, text_columns, nlp) + chunk.iloc[i] = processed_row + chunk_replacements += row_replacements + return chunk, chunk_replacements + +# Parallel processing of chunks +def parallel_process_chunks(chunks, text_columns, nlp): + processed_chunks = [] + replacements_tracker = [] + with ProcessPoolExecutor() as executor: + futures = [executor.submit(process_chunk, chunk, text_columns, nlp) for chunk in chunks] + for future in tqdm(futures, total=len(futures), desc="Processing Chunks"): + processed_chunk, chunk_replacements = future.result() + processed_chunks.append(processed_chunk) + replacements_tracker.append(chunk_replacements) + return processed_chunks, replacements_tracker + +# Calculate improvement score based on replacements +def calculate_improvement_score(replacements_tracker, data_length): + total_replacements = sum(replacements_tracker) + return total_replacements / (data_length * MAX_LENGTH) + +# Process the CSV in chunks and apply PII removal +def process_csv_in_chunks(file_path, chunksize=1000): + # Get text columns to process + sample_df = pd.read_csv(file_path, nrows=0) + text_columns = sample_df.select_dtypes(include=['object']).columns.tolist() + + # Read the dataset in chunks + chunks = [chunk for chunk in pd.read_csv(file_path, chunksize=chunksize)] + cleaned_file_path = file_path.replace('.csv', '_cleaned.csv') + + # Process chunks in parallel + processed_chunks, replacements_tracker = 
parallel_process_chunks(chunks, text_columns, nlp) + + # Calculate the improvement score + improvement_score = calculate_improvement_score(replacements_tracker, len(chunks)) + + # Write processed data back to a cleaned CSV + for i, processed_chunk in enumerate(processed_chunks): + mode = 'w' if i == 0 else 'a' + header = True if i == 0 else False + processed_chunk.to_csv(cleaned_file_path, index=False, mode=mode, header=header) + + return replacements_tracker, improvement_score + +# Main execution block +if __name__ == "__main__": + script_dir = os.path.dirname(os.path.realpath(__file__)) + data_dir = os.path.join(script_dir, '..', '..', 'data', 'processed') + file_name = 'combined_dataset.csv' + file_path = os.path.join(data_dir, file_name) + + replacements_tracker, improvement_score = process_csv_in_chunks(file_path, chunksize=3000) + + # Plotting the results + fig, ax1 = plt.subplots() + ax2 = ax1.twinx() + ax1.bar(range(len(replacements_tracker)), replacements_tracker, color='g') + ax2.plot(range(len(replacements_tracker)), [improvement_score] * len(replacements_tracker), color='b', linestyle='--') + ax1.set_xlabel('Chunk Index') + ax1.set_ylabel('Number of Replacements', color='g') + ax2.set_ylabel('Improvement Score', color='b') + plt.title('PII Replacements and Improvement Score per Chunk') + plt.savefig(os.path.join(script_dir, '..', '..', 'docs', 'replacements_chart.png')) + plt.show()