# datav.py — from the HopeFlynn/data-visualisation repository (main branch).

# ----------------------------------------
# Step 1: Load and Explore the Dataset
# ----------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

# Load the iris dataset bundled with scikit-learn into a DataFrame.
# Every later step reads `df`, so a load failure is reported and then re-raised:
# the script stops here with the real cause instead of continuing and failing
# further down with a NameError on `df`.
try:
    iris = load_iris()
except FileNotFoundError:
    print("Error: File not found.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    # iris.target holds integer class codes; translate each code to its
    # species name using its position in iris.target_names.
    species_names = dict(enumerate(iris.target_names))
    df['species'] = pd.Series(iris.target, index=df.index).map(species_names)
    print("Dataset loaded successfully.\n")

# Quick look at the data: first rows, per-column dtypes and null counts.
# Each heading and its value are printed on separate lines via sep="\n".
print("First 5 rows of the dataset:", df.head(), sep="\n")
print("\nData types:", df.dtypes, sep="\n")
print("\nMissing values:", df.isna().sum(), sep="\n")

# Drop rows containing nulls (shown for learning; not required for this
# dataset). The code visible in this script keeps working on `df` afterwards.
df_cleaned = df.dropna()

# ----------------------------------------
# Step 2: Basic Data Analysis
# ----------------------------------------

# Summary statistics for the numeric columns.
print("\nBasic Statistics:", df.describe(), sep="\n")

# Mean of every feature within each species (rows: species, columns: features).
grouped_means = df.groupby('species').mean()
print("\nAverage values per species:", grouped_means, sep="\n")

# For each feature column, report the species whose mean is largest.
# idxmax() on the grouped frame yields one winning species label per feature.
print("\nSpecies with the highest average for each feature:")
for feature, max_species in grouped_means.idxmax().items():
    print(f" - {feature}: {max_species}")

# ----------------------------------------
# Step 3: Data Visualization
# ----------------------------------------

# Apply seaborn's "whitegrid" style to the figures drawn from here on.
sns.set(style="whitegrid")

# Line chart: petal length plotted against the DataFrame row index.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df.index, df['petal length (cm)'], label='Petal Length', color='green')
ax.set_title("Line Chart: Petal Length Over Index")
ax.set_xlabel("Index")
ax.set_ylabel("Petal Length (cm)")
ax.legend()
ax.grid(True)
plt.show()

# Bar chart: mean petal length for each species.
# barplot aggregates the rows per species with its default estimator (mean).
# errorbar=None turns off the error bars; it replaces the `ci=None` spelling,
# which seaborn deprecated in 0.12 and which now triggers a FutureWarning.
plt.figure(figsize=(8, 5))
sns.barplot(data=df, x='species', y='petal length (cm)', errorbar=None)
plt.title("Bar Chart: Average Petal Length per Species")
plt.xlabel("Species")
plt.ylabel("Average Petal Length (cm)")
plt.show()

# Histogram: distribution of sepal width in 20 bins with a KDE curve overlaid.
plt.figure(figsize=(8, 5))
hist_ax = sns.histplot(df['sepal width (cm)'], kde=True, bins=20, color='skyblue')
# histplot returns the Axes it drew on; label that Axes directly.
hist_ax.set_title("Histogram: Distribution of Sepal Width")
hist_ax.set_xlabel("Sepal Width (cm)")
hist_ax.set_ylabel("Frequency")
plt.show()

# Scatter plot: sepal length against petal length, one colour per species.
plt.figure(figsize=(8, 5))
scatter_ax = sns.scatterplot(
    data=df,
    x='sepal length (cm)',
    y='petal length (cm)',
    hue='species',
)
# scatterplot returns the Axes it drew on; set the labels and legend there.
scatter_ax.set_title("Scatter Plot: Sepal Length vs Petal Length")
scatter_ax.set_xlabel("Sepal Length (cm)")
scatter_ax.set_ylabel("Petal Length (cm)")
scatter_ax.legend(title='Species')
plt.show()