-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathproject_cleaning_players.py
More file actions
102 lines (67 loc) · 3.29 KB
/
project_cleaning_players.py
File metadata and controls
102 lines (67 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: utf-8 -*-
"""Project_CLEANING_Players.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1iecy5gv1oFMBgWBAc3JxxAFEMP3bOY1m
"""
import pandas as pd
from google.colab import files
# Player whose presence in the raw data is checked, in the raw 'Last, First' form.
search_name = 'Winston, Jameis'

# Columns the cleaned output does not need.
columns_to_drop = ['Birth Place', 'College', 'Current Status', 'Current Team', 'High School', 'High School Location', 'Number', 'Player Id']


def parse_birthdays(players):
    """Return a copy of *players* with 'Birthday' parsed from MM/DD/YYYY.

    Values that do not match the format become NaT (errors='coerce') and
    their rows are dropped. The input frame is not modified.
    """
    birthdays = pd.to_datetime(players['Birthday'], format='%m/%d/%Y', errors='coerce')
    return players.assign(Birthday=birthdays).dropna(subset=['Birthday'])


def latest_birthday_row(players):
    """Return the row holding the maximum 'Birthday', or None if *players* is empty.

    The emptiness guard matters because idxmax() raises ValueError on an
    empty column.
    """
    if players.empty:
        return None
    return players.loc[players['Birthday'].idxmax()]


def count_nulls(players):
    """Return {column name: null count} for every column with at least one null."""
    totals = players.isnull().sum()
    return {column: int(total) for column, total in totals.items() if total > 0}


def to_first_last(names):
    """Rearrange 'Last, First' names into 'First Last'.

    Names that contain no ', ' separator are returned unchanged instead of
    being turned into NaN.
    """
    parts = names.str.split(', ')
    reordered = parts.str[1] + ' ' + parts.str[0]
    # Where there was no second piece the concatenation is NaN; fall back to
    # the original value for those rows.
    return reordered.fillna(names)


def main():
    """Clean the uploaded player data and offer the result for download.

    Reads 'players_age.csv', prints the same checkpoints the notebook cells
    printed, writes 'modified_players_data.csv' and returns the cleaned frame.
    """
    # NOTE(review): presumably the Colab upload widget saves the chosen file
    # into the working directory so read_csv can find it -- confirm.
    files.upload()
    # Only the raw input is read here; 'modified_players_data.csv' is this
    # script's output and does not exist on a first run.
    players = pd.read_csv('players_age.csv')

    # Print True if search_name is present in the raw 'Name' column, else False.
    print(search_name in players['Name'].values)

    # Parse birthdays and drop rows whose birthday could not be parsed.
    players = parse_birthdays(players)

    # Report the maximum (latest) birthday and the player it belongs to.
    latest = latest_birthday_row(players)
    if latest is None:
        print("No players with a valid birthday were found.")
    else:
        print("Maximum Date:", latest['Birthday'].strftime('%m/%d/%Y'))
        print("Player Name:", latest['Name'])

    # Dropping columns we won't need (missing ones are ignored).
    players = players.drop(columns=columns_to_drop, errors='ignore')
    print(players)

    # Finding null value count in all columns.
    for column, null_count in count_nulls(players).items():
        print(f"Column '{column}' has {null_count} null values.")

    # Dropping all rows with null birthdays. parse_birthdays already removed
    # them, so this is a safety net that keeps the printed checkpoints aligned
    # with the notebook's cells.
    players = players.dropna(subset=['Birthday'])
    print(players)

    # Change the format of the Name column to match our other database.
    players = players.assign(Name=to_first_last(players['Name']))
    print(players)

    # Switching the Birthday column from "MM/DD/YYYY" to "YYYY-MM-DD". The
    # column is already datetime at this point, so no second parse is needed.
    players = players.assign(Birthday=players['Birthday'].dt.strftime('%Y-%m-%d'))
    print(players)

    # Double checking to see if there are any null values in Name or Birthday.
    null_name = players['Name'].isnull().any()
    null_birthday = players['Birthday'].isnull().any()
    print(f"Null values in 'Name' column: {null_name}")
    print(f"Null values in 'Birthday' column: {null_birthday}")

    # Remove any remaining rows without a birthday before saving.
    players = players.dropna(subset=['Birthday'])
    players.to_csv('modified_players_data.csv', index=False)
    files.download('modified_players_data.csv')
    return players


if __name__ == "__main__":
    players_data = main()