CodeAlpha_DataVisualization/data_eda_visualization.py at main · Waheed-6907/CodeAlpha_DataVisualization · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#1. Load Dataset
# Reads the Superstore CSV from the current working directory into `df`,
# which every later section of this script uses.
df = pd.read_csv('SampleSuperstore.csv')
# A bare `df.head()` only renders in a notebook cell; when this file runs as a
# script the result is discarded, so print it explicitly.
print(df.head()) #First Few Rows

#2. Understanding Dataset
# Prints the shape, column index, per-column dtype report and numeric summary.
print("(Rows,Columns):", df.shape)
print("Column Names:", df.columns)
# DataFrame.info() writes its report straight to stdout and returns None, so
# passing it to print() emitted the report first and then "Data Types: None".
# Print the label, then let info() write the report under it.
print("Data Types:")
df.info()
print("Summary:", df.describe())

#3. Basic Understanding
# Row count of the frame, reported as the number of orders.
print("Total Number of Orders:", df.shape[0])
# NOTE(review): this prints the count of non-null 'Segment' values for each
# City (one number per city), not a single consumer total - confirm the intent.
segment_rows_per_city = df.groupby('City')['Segment'].count()
print("Total Number of Consumers:", segment_rows_per_city)

# Total sales per city, then the same totals ranked from highest to lowest.
total_sales = df.groupby('City')['Sales'].sum()
print("\nSales By City:\n", total_sales)
city_ranking = total_sales.sort_values(ascending=False)
print("\nWhich City has Highest Sales?:\n", city_ranking)
# Index label (city name) of the largest total.
high_city = total_sales.idxmax()
print(f"{high_city} generates the highest revenue among all cities.")

#4. Sales Analysis
# Sales summed per Category, printed from highest to lowest.
cat_sales = df.groupby('Category')['Sales'].sum()
cat_ranking = cat_sales.sort_values(ascending=False)
print("\nCategory-Wise Sales (Top-Bottom):\n", cat_ranking)
high_cat = cat_sales.idxmax()
print(f"{high_cat} category has been most demanding")

# Same breakdown one level down, per Sub-Category.
subcat_sales = df.groupby('Sub-Category')['Sales'].sum()
subcat_ranking = subcat_sales.sort_values(ascending=False)
print("\nSub-Category Wise Revenue:\n", subcat_ranking)
high_subcat = subcat_sales.idxmax()
print(f"{high_subcat} has highest revenue")

#5. Profit Analysis
# Profit summed per Region; idxmax() gives the label of the largest total.
totalreg_prof = df.groupby('Region')['Profit'].sum()
print("Region-Wise Profits:", totalreg_prof)
best_region = totalreg_prof.idxmax()
print(f"{best_region} region has the highest profit")

# Profit summed per Category.
totalcat_prof = df.groupby('Category')['Profit'].sum()
print("Category-Wise Profits:", totalcat_prof)
best_category = totalcat_prof.idxmax()
print(f"{best_category} category has the highest profit")

# Keep only rows with negative profit and total them per Sub-Category.
loss = df[df['Profit'].lt(0)]
loss_prod = loss.groupby('Sub-Category')['Profit'].sum()
# Ascending sort puts the most negative totals first; take the first five.
worst_five = loss_prod.sort_values(ascending=True).head(5)
print("\nTop 5 Loss Making Products:\n", worst_five)
high_loss = worst_five.idxmin()
print(f"{high_loss} is most Loss-Making product")

#5. Shipping Analysis
# Number of rows per ship mode; idxmax() is the most frequently used mode.
high_ship = df['Ship Mode'].value_counts()
print(high_ship)
print(f"{high_ship.idxmax()} is the most preferred ship class")
# Total profit per ship mode, highest first.
ship_prof = df.groupby('Ship Mode')['Profit'].sum()
print(ship_prof.sort_values(ascending=False))

# Compare the AVERAGE profit per row of the slow and fast groups.
# Comparing summed profit (as before) mostly measures how many rows each group
# has, so the group with more orders would win regardless of shipping speed.
# Selecting rows with isin() also avoids a KeyError when a mode is absent.
slow_modes = ['Standard Class', 'Second Class']
fast_modes = ['First Class', 'Same Day']
slow_ship = df.loc[df['Ship Mode'].isin(slow_modes), 'Profit'].mean()
fast_ship = df.loc[df['Ship Mode'].isin(fast_modes), 'Profit'].mean()
print("\nDoes faster shipping reduce profit?:")
# If a group has no rows its mean is NaN, the comparison is False and "No" is printed.
if slow_ship > fast_ship:
    print("Yes")
else:
    print("No")
#6. Customer Analysis
# Total quantity purchased per customer segment.
qty_by_segment = df.groupby('Segment')['Quantity'].sum()
print(qty_by_segment)
# Segment label with the largest summed quantity.
high_seg = qty_by_segment.idxmax()
print(f"{high_seg} segment buys the most")

#6. Discount Analysis
# Mean discount per Region, computed once and reused for every print below.
totalreg_disc = df.groupby('Region')['Discount'].mean()
print(totalreg_disc)
print(f"{totalreg_disc.idxmax()} region has most discounts")
# Region profit totals (from the profit section) next to the mean discounts.
print(totalreg_prof, totalreg_disc)
# NOTE(review): despite its name, highreg_prof is the region with the LOWEST
# total profit (idxmin). Neither highreg_* value is read again in the visible
# part of this script - confirm whether a comparison was intended.
highreg_prof = totalreg_prof.idxmin()
highreg_disc = totalreg_disc.idxmax()
print("\nDoes higher discount reduce profit?:")
# Pairwise correlation matrix of the Discount and Profit columns.
discount_profit_corr = df[['Discount', 'Profit']].corr()
print(discount_profit_corr)
#Data Visualization
sns.set(style="whitegrid")
#1. Category vs Sales
# Bar chart of total sales per category, tallest bar first.
cat_sales = cat_sales.sort_values(ascending=False)
# Use the same figure size as the other charts in this script.
plt.figure(figsize=(8,5))
cat_sales.plot(kind='bar')
# Take the leading category from the data instead of hard-coding "Technology",
# so the title stays correct if the dataset changes.
plt.title(f'{cat_sales.idxmax()} Category Drives Highest Revenue')
plt.xlabel('Category')
plt.ylabel('Sales')
plt.show()
#2. Category vs Profit
# Bar chart of the per-category profit totals computed in the profit section.
fig, ax = plt.subplots(figsize=(8, 5))
totalcat_prof.plot(kind='bar', ax=ax)
ax.set_title('Category vs Profit')
ax.set_xlabel('Category')
ax.set_ylabel('Profit')
plt.show()
#3. Loss-Making Sub Categories
# Rows with negative profit, totalled per Sub-Category; the ascending sort puts
# the largest losses first and head(10) keeps at most ten of them.
loss = df[df['Profit']<0]
loss_subcat = loss.groupby('Sub-Category')['Profit'].sum().sort_values().head(10)
plt.figure(figsize=(8,5))
loss_subcat.plot(kind='bar')
# The old title was a garbled, hard-coded string ("Blinders Drivens to be...").
# Name the worst sub-category from the data so the title is readable and stays
# correct for any dataset.
plt.title(f'{loss_subcat.idxmin()} is the most loss-making sub-category')
plt.xlabel('Sub-Category')
plt.ylabel('Profit')
plt.show()
#4. Ship Mode Usage
# Count of rows per ship mode, drawn as one bar per mode.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Ship Mode', ax=ax)
ax.set_title('Standard Ship Mode is most widely used')
plt.show()
#5. Discount vs Profit
# One point per row: discount on the x-axis, profit on the y-axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='Discount', y='Profit', ax=ax)
ax.set_title('More Discounts leads to Less profits')
plt.show()
#6. Correlation Heatmap
# Pairwise correlations of the four numeric columns, annotated with values.
numeric_cols = ['Sales', 'Profit', 'Discount', 'Quantity']
corr_matrix = df[numeric_cols].corr()
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()
#7.Top 10 Sub-Categories by Sales
# Total sales per sub-category, sorted highest first, keeping the first ten.
sales_per_subcat = df.groupby('Sub-Category')['Sales'].sum()
topcat_sales = sales_per_subcat.sort_values(ascending=False).head(10)
fig, ax = plt.subplots(figsize=(8, 5))
topcat_sales.plot(kind='bar', ax=ax)
ax.set_title('Top 10 Sub-Categories by Sales')
plt.show()