SpamClassifier/DectectorApp.py at main · SimbongeN/SpamClassifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Streamlit web application that serves the trained spam-classification model.
"""
import streamlit as st
import pandas as pd
import pickle
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK 'stopwords' and 'wordnet' resources.
nltk.download('stopwords')
nltk.download('wordnet')

# Load the pickled vectorizer and model.  Each file is opened in a `with`
# block so its handle is closed as soon as unpickling finishes instead of
# being left open until the garbage collector gets to it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only deploy .pkl artifacts that were produced by this project.
with open("vectortizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Page title with a rainbow divider underneath.
page_title = "Spam Detector Appliction :sunglasses:"
st.header(page_title, divider='rainbow')

# Introductory text telling the visitor what the app does.
welcome_text = '''
         Welcome to our Spam Classifier! 📧🔍
         This application helps you determine whether an email , sms or message
         is likely to be spam or not. Simply input the email, sms content or content of the message, and our model
         will analyze it for you. Let’s keep those inboxes clean!
'''
st.write(welcome_text)

# Author credit followed by a button linking to the GitHub profile.
author_credit = "Developed by :rainbow[Simbonge Ndlovu]"
github_label, github_url = "Visit My GitHub", "https://github.com/SimbongeN"
st.write(author_credit)
st.link_button(github_label, github_url)

# Horizontal rule separating this section from the next one.
st.divider()

# Statistics shown on the page: the size of the data set plus the model's
# accuracy and precision.
# NOTE(review): presumably SpamDataSet.csv is the data the model was trained
# on (the metric below is labelled "Training Data") - confirm.
modelData = pd.read_csv('SpamDataSet.csv', sep=";", encoding="ISO-8859-1")
num_rows = modelData.shape[0]  # number of rows in the data set
category = modelData['Category'].value_counts()
# Default to 0: without it .get() returns None when no row is labelled
# 'spam', and the arithmetic done on spam_count further down the script
# would raise TypeError.
spam_count = category.get('spam', 0)
orginal = 7993  # baseline subtracted from spam_count further down the script

# Show the three figures side by side.
col1, col2, col3 = st.columns(3)
col1.metric("Training Data", f"{num_rows}+")
# Accuracy and precision are hard-coded strings, not computed in this script.
col2.metric("Model Accuracy", "95.9%")
col3.metric("Model Precision", "98.3%")

st.divider()#content divider

# Let the user try the model by entering email, SMS or message content.
st.subheader("Classify Content")
user_input = st.text_area(
    "Message to analys",
    placeholder="Enter your Email, sms or Message content here",
    height=250,
)

# --- Pre-process the text entered by the user ---
# Replace line breaks with spaces.  The replacement is applied to the whole
# string: mapping it over individual characters can never match the
# two-character sequence "\r\n", so that form leaves the text untouched.
cleanedMessage = user_input.replace("\r\n", " ").replace("\n", " ")

Lemmatizer = WordNetLemmatizer()  # reduces each word to its lemma
stop_words = set(stopwords.words('english'))  # set for fast membership tests

# Lower-case, strip punctuation, split on whitespace, drop stop words and
# lemmatize what is left, then join the tokens back into a single string.
# NOTE(review): these steps presumably have to mirror the preprocessing used
# when the vectorizer was fitted - confirm before changing their order.
Message = cleanedMessage.lower()
Message = Message.translate(str.maketrans('', '', string.punctuation)).split()
Message = [Lemmatizer.lemmatize(word) for word in Message if word not in stop_words]
Message = ' '.join(Message)

# Vectorize the cleaned text (a batch containing one document).
vector_input = vectorizer.transform([Message])
# Predict the label for that document.
result = model.predict(vector_input)

# Show the verdict once the user presses the button.
if st.button(":red[Classify ]"):
    if not user_input.strip():
        # Nothing (or only whitespace) was entered: a verdict on empty text
        # would be meaningless, so ask for input instead.
        st.warning("Please enter a message to classify.")
    elif result == 1:
        # A predicted label of 1 is reported as spam.
        st.subheader("SPAM")
        # NOTE(review): spam_count is rebuilt from the CSV earlier in this
        # script, so this increment presumably does not survive a Streamlit
        # rerun - confirm whether it should be kept in st.session_state.
        spam_count += 1
    else:
        st.subheader("NOT SPAM")

# Difference from the baseline; only used by the disabled metric below.
diff = spam_count - orginal
st.divider()  # content divider

# Disabled: metric showing the change in the number of spam messages detected.
#st.metric("Spam Detected", spam_count, str(diff)+"+")