# datav.py
# ----------------------------------------
# Step 1: Load and Explore the Dataset
# ----------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
# Load the iris dataset via sklearn.datasets.load_iris.
# Only the loader call is guarded. If it fails, the error is reported and the
# script stops here, because every later step needs ``df``; continuing would
# only surface as a confusing NameError further down.
try:
    iris = load_iris()
except FileNotFoundError:
    print("Error: File not found.")
    raise SystemExit(1) from None
except Exception as e:
    print(f"An error occurred: {e}")
    raise SystemExit(1) from e

# One column per feature, plus a 'species' column holding the class name.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['species'] = iris.target
# Translate the integer class codes into their species names.
df['species'] = df['species'].map(dict(enumerate(iris.target_names)))
print("Dataset loaded successfully.\n")
# Display first few rows
print("First 5 rows of the dataset:")
print(df.head())

# Check data types and missing values
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())

# Clean dataset: drop every row that contains a missing value.
# ``df`` is rebound to the cleaned frame so that the statistics and plots that
# follow actually use it; previously the result was stored in ``df_cleaned``
# and never read. ``df_cleaned`` is kept as a name for the same frame.
df_cleaned = df.dropna()
df = df_cleaned
# ----------------------------------------
# Step 2: Basic Data Analysis
# ----------------------------------------
# Summary statistics as produced by DataFrame.describe()
print("\nBasic Statistics:")
summary = df.describe()
print(summary)

# Mean of every measurement within each species
grouped_means = df.groupby('species').mean()
print("\nAverage values per species:")
print(grouped_means)

# For each feature column, report the species whose mean is largest.
# idxmax() on the grouped frame yields one species label per feature.
print("\nSpecies with the highest average for each feature:")
top_species = grouped_means.idxmax()
for feature, max_species in top_species.items():
    print(f" - {feature}: {max_species}")
# ----------------------------------------
# Step 3: Data Visualization
# ----------------------------------------
# Apply seaborn's "whitegrid" style to the figures created below.
sns.set(style="whitegrid")

# Line chart: petal length plotted against the DataFrame index.
line_fig, line_ax = plt.subplots(figsize=(10, 5))
line_ax.plot(
    df.index,
    df['petal length (cm)'],
    label='Petal Length',
    color='green',
)
line_ax.set_title("Line Chart: Petal Length Over Index")
line_ax.set_xlabel("Index")
line_ax.set_ylabel("Petal Length (cm)")
line_ax.legend()
line_ax.grid(True)
plt.show()
# Bar Chart: one bar per species showing its petal length.
# ``errorbar=None`` turns the error bars off. It replaces the ``ci=None``
# argument, which seaborn deprecated in v0.12 (requires seaborn >= 0.12).
plt.figure(figsize=(8, 5))
sns.barplot(data=df, x='species', y='petal length (cm)', errorbar=None)
plt.title("Bar Chart: Average Petal Length per Species")
plt.xlabel("Species")
plt.ylabel("Average Petal Length (cm)")
plt.show()
# Histogram: sepal width in 20 bins with a KDE curve overlaid.
hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
sepal_width = df['sepal width (cm)']
sns.histplot(sepal_width, kde=True, bins=20, color='skyblue', ax=hist_ax)
hist_ax.set_title("Histogram: Distribution of Sepal Width")
hist_ax.set_xlabel("Sepal Width (cm)")
hist_ax.set_ylabel("Frequency")
plt.show()
# Scatter plot: sepal length against petal length, coloured by species.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x='sepal length (cm)',
    y='petal length (cm)',
    hue='species',
    ax=scatter_ax,
)
scatter_ax.set_title("Scatter Plot: Sepal Length vs Petal Length")
scatter_ax.set_xlabel("Sepal Length (cm)")
scatter_ax.set_ylabel("Petal Length (cm)")
scatter_ax.legend(title='Species')
plt.show()