Core processing engine for SenderStats.
This library provides the underlying logic for analyzing outbound email traffic, identifying top senders, and generating structured reporting data from Proofpoint Smart Search exports or CSV datasets.
Install: `pip install senderstats-core`
from types import SimpleNamespace
from senderstats_core.common.defaults import *
from senderstats_core.data.data_source_type import DataSourceType
from senderstats_core.processing.config_manager import ConfigManager
from senderstats_core.processing.data_source_manager import DataSourceManager
from senderstats_core.processing.pipeline_manager import PipelineManager
from senderstats_core.processing.pipeline_processor import PipelineProcessor
from senderstats_core.reporting.pipeline_processor_report import PipelineProcessorReport
def process(input_files=None, output_file='output.xlsx'):
    """Run the SenderStats pipeline over CSV input and write a report.

    Builds an argument namespace populated with the library defaults, wraps
    it in a ``ConfigManager``, pushes the data through the pipeline, prints
    the filter summary, and generates a report with
    ``PipelineProcessorReport``.

    Args:
        input_files: A single path or an iterable of paths to read.
            ``None`` (the default) uses the sample names ``'file1.csv'``
            and ``'file2.csv'``.
        output_file: Path handed to the report writer. Defaults to
            ``'output.xlsx'``.

    Raises:
        ValueError: If ``config.input_files`` is empty once the
            configuration has been built.
    """
    if input_files is None:
        input_files = ['file1.csv', 'file2.csv']
    elif isinstance(input_files, str):
        # A lone path would otherwise be iterated character by character.
        input_files = [input_files]
    else:
        input_files = list(input_files)

    # Create args namespace (same attributes, in the same order, as a
    # command-line front end would supply).
    args = SimpleNamespace(
        debug=True,
        source_type=DataSourceType.CSV,
        input_files=input_files,
        output_file=output_file,
        # Field-name settings, taken from the library defaults.
        ip_field=DEFAULT_IP_FIELD,
        mfrom_field=DEFAULT_MFROM_FIELD,
        hfrom_field=DEFAULT_HFROM_FIELD,
        rcpts_field=DEFAULT_RCPTS_FIELD,
        rpath_field=DEFAULT_RPATH_FIELD,
        msgid_field=DEFAULT_MSGID_FIELD,
        subject_field=DEFAULT_SUBJECT_FIELD,
        msgsz_field=DEFAULT_MSGSZ_FIELD,
        date_field=DEFAULT_DATE_FIELD,
        # Optional processing / reporting toggles.
        gen_hfrom=False,
        gen_rpath=False,
        gen_alignment=False,
        gen_msgid=False,
        expand_recipients=False,
        no_display=False,
        remove_prvs=False,
        decode_srs=False,
        normalize_bounces=False,
        normalize_entropy=False,
        no_empty_hfrom=False,
        sample_subject=False,
        with_probability=True,
        # Exclusion / restriction lists (empty: nothing extra is listed).
        exclude_ips=[],
        exclude_domains=[],
        restrict_domains=[],
        exclude_senders=[],
        exclude_dup_msgids=False,
        date_format=DEFAULT_DATE_FORMAT,
        no_default_exclude_domains=False,
        no_default_exclude_ips=False,
    )
    config = ConfigManager(args)

    # Treat "no input files" as an error path (not a silent return).
    # The check is on config.input_files, not args.input_files; presumably
    # ConfigManager resolves the list to files that exist -- confirm.
    if not config.input_files:
        raise ValueError("No input files to read. Please check the input files exist.")

    # Show current filtering options selected
    config.display_filter_criteria()

    # Data source manager can be extended to read streams, etc. Default to CSV
    data_source_manager = DataSourceManager(config)

    # Pipeline manager is responsible for which filter, transform, processor
    # is triggered and the order of them
    pipeline_manager = PipelineManager(config)

    # Pipeline processor does the internal processing based on the
    # DataSource interface.
    processor = PipelineProcessor(data_source_manager, pipeline_manager)

    # Processes the data based on the DataSourceManager(), essentially
    # calling read_data till all data has been processed.
    processor.process_data()

    # Display a summary for data processed
    pipeline_manager.get_filter_manager().display_summary()

    # Create a pipeline processor report
    report = PipelineProcessorReport(config.output_file, pipeline_manager, config.with_probability)

    # Enumerates the reportable processing nodes to generate the report.
    report.generate()

    # Save the data to the output file.
    # NOTE(review): close() is skipped if generate() raises; confirm whether
    # the report should be closed (or discarded) on failure.
    report.close()