# This is an updated title - Sui
# Just a test boi
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Path of the raw CSV file.
auto_path = 'Automobile.csv'

# Column headings, in file order. Per the original notes, the raw file has no
# header row and uses '#' as its separator, so the headings from the
# assignment brief are supplied explicitly.
column_names = [
    'symboling', 'normalized_losses', 'make', 'fuel_type',
    'aspiration', 'num_of_doors', 'body_style', 'drive_wheels',
    'engine_location', 'wheel_base', 'length', 'width',
    'height', 'curb_weight', 'engine_type', 'num_of_cylinders',
    'engine_size', 'fuel_system', 'bore', 'stroke',
    'compression_ratio', 'horsepower', 'peak_rpm', 'city_mpg',
    'highway_mpg', 'price',
]

# Build the 'auto' dataframe from the raw file with the headings attached.
auto = pd.read_csv(auto_path, sep='#', names=column_names)

# First look at the data: leading rows, column dtypes and missing-value counts.
auto.head()
auto.dtypes
auto.isna().sum()
# Text columns to normalise: surrounding whitespace is stripped and the values
# are upper-cased so that differently-formatted spellings compare equal.
text_columns = [
    'make', 'fuel_type', 'aspiration', 'num_of_doors', 'body_style',
    'drive_wheels', 'engine_location', 'engine_type', 'num_of_cylinders',
    'fuel_system',
]
for column in text_columns:
    auto[column] = auto[column].str.strip().str.upper()

# With the string data cleaned up, drop exact duplicate rows in place
# (the first occurrence of each is kept, which is the default).
auto.drop_duplicates(inplace=True)
# Frequency of each symboling value. The possible values are
# -3, -2, -1, 0, 1, 2, 3.
auto['symboling'].value_counts()
# Rows carrying the out-of-range symboling value 4.
auto.loc[auto['symboling'] == 4]

# Frequency of each make, then repair the corrupted 'VOL00112OV' spelling.
auto['make'].value_counts()
auto['make'] = auto['make'].replace({'VOL00112OV': 'VOLVO'})
# Check the changes
auto['make'].value_counts()

# Compare against the other VOLVO rows: first those with standard aspiration
# and gas fuel, then those priced at 16845.
is_volvo = auto['make'] == 'VOLVO'
auto.loc[is_volvo & (auto['aspiration'] == 'STD') & (auto['fuel_type'] == 'GAS')]
auto.loc[is_volvo & (auto['price'] == 16845)]

# Original analysis notes: recoding the symboling makes this vehicle a
# duplicate of another row except for its normalized_losses value. Whether the
# 25.0 in normalized_losses is incorrect (and should perhaps be 95.0, making
# it a true duplicate) is a question for the dataset owner. For this exercise
# the rows are assumed to be two different models, one with a genuine
# normalized loss of 25.

# Recode the out-of-range symboling value 4 as -1.
auto['symboling'] = auto['symboling'].replace({4: -1})
# Check the changes
auto['symboling'].value_counts()
# Summary statistics and a 20-bin histogram of normalized_losses.
losses = auto['normalized_losses']
losses.describe()
losses.plot.hist(bins=20)
plt.show()

# Look more closely at the automobiles at either extreme (below 50 or
# above 200). Per the original notes, these points are left as they are.
auto.loc[(losses < 50) | (losses > 200)]
# Frequency of each fuel type.
auto['fuel_type'].value_counts()

# Frequency of each aspiration value, then repair the 'TURRRRBO' misspelling.
auto['aspiration'].value_counts()
auto['aspiration'] = auto['aspiration'].replace({'TURRRRBO': 'TURBO'})
# Check the changes
auto['aspiration'].value_counts()

# Frequency of each door count, then repair the 'FOURR' misspelling.
auto['num_of_doors'].value_counts()
auto['num_of_doors'] = auto['num_of_doors'].replace({'FOURR': 'FOUR'})
# Check the changes
auto['num_of_doors'].value_counts()

# Rows with a missing door count, followed by all DODGE and MAZDA rows
# for comparison.
auto.loc[auto['num_of_doors'].isna()]
auto.loc[auto['make'].isin(['DODGE', 'MAZDA'])]

# Fill the missing door counts (two, per the original notes) with FOUR.
auto['num_of_doors'] = auto['num_of_doors'].fillna('FOUR')
# Check the changes
auto['num_of_doors'].isna().sum()
# Frequency tables for the categorical body / drivetrain columns.
for categorical in ('body_style', 'drive_wheels', 'engine_location'):
    auto[categorical].value_counts()

# Summary statistics and a 15-bin histogram for each size / weight column,
# shown one figure at a time.
for dimension in ('wheel_base', 'length', 'width', 'height', 'curb_weight'):
    auto[dimension].describe()
    auto[dimension].plot(kind='hist', bins=15)
    plt.show()
# Frequency of each engine type and cylinder count.
auto['engine_type'].value_counts()
auto['num_of_cylinders'].value_counts()

# Take a closer look at the THREE and TWELVE cylinder rows, checking engine
# size and the other engine characteristics.
auto.loc[auto['num_of_cylinders'].isin(['THREE', 'TWELVE'])]
# Original analysis notes: the 12 cylinder has a very large engine, very high
# horsepower and very poor fuel consumption, so these num_of_cylinders values
# are accepted as correct.

# Summary statistics and a 15-bin histogram of engine size.
engine_size = auto['engine_size']
engine_size.describe()
engine_size.plot.hist(bins=15)
plt.show()
# Frequency of each fuel system.
auto['fuel_system'].value_counts()

# Remaining numeric columns mapped to the number of histogram bins used for
# each; the columns are processed in this order.
# NOTE(review): the original notes mention a hypothesis to test between the
# compression_ratio and horsepower steps; its details are not recorded here.
histogram_bins = {
    'bore': 15,
    'stroke': 15,
    'compression_ratio': 15,
    'horsepower': 15,
    'peak_rpm': 15,
    'city_mpg': 10,
    'highway_mpg': 15,
    'price': 50,
}

# Summary statistics and a histogram for each column, one figure at a time.
for column, bin_count in histogram_bins.items():
    auto[column].describe()
    auto[column].plot(kind='hist', bins=bin_count)
    plt.show()
# Rows recorded with a price of 0.
auto.loc[auto['price'] == 0]
# Show the VOLVO rows so similar models can be compared.
auto.loc[auto['make'] == 'VOLVO']

# Overwrite the price on the rows labelled 210 and 211.
# NOTE(review): the row labels are hard-coded; presumably they are the VOLVO
# rows inspected above -- confirm against the data.
price_corrections = {210: 22470, 211: 22625}
for row_label, corrected_price in price_corrections.items():
    auto.at[row_label, 'price'] = corrected_price

# Set normalized_losses to 95.0 on the rows labelled 208 and 209.
for row_label in (208, 209):
    auto.at[row_label, 'normalized_losses'] = 95.0

# Now drop the duplicates (first occurrence kept, the default). Per the
# original notes, this effectively merges the edited rows with their twins.
auto.drop_duplicates(inplace=True)
# Check the changes
auto.loc[auto['make'] == 'VOLVO']
# Count the missing values remaining in each column.
auto.isnull().sum()
# Show every row that is missing bore, stroke, horsepower, peak_rpm or price.
auto[auto[['bore', 'stroke', 'horsepower', 'peak_rpm', 'price']].isnull().any(axis=1)]
# Show the MAZDA rows for comparison.
auto[auto.make == 'MAZDA']

# Scatter plot of city_mpg against horsepower.
sns.relplot(x='horsepower',
            y='city_mpg',
            alpha=0.4,
            data=auto,
            color='mediumseagreen'
            )
plt.show()

# Rows with a city_mpg of 23; their medians are used for the imputation below.
city_mpg_is_23 = auto.city_mpg == 23
auto[city_mpg_is_23]

# Fill the missing horsepower and peak_rpm values with the median of the
# city_mpg == 23 rows. The result is assigned back to the column rather than
# calling fillna(inplace=True) on the column: that chained in-place form
# triggers a FutureWarning in pandas 2.2 and no longer updates the dataframe
# under Copy-on-Write (pandas 3.0).
auto['horsepower'] = auto['horsepower'].fillna(auto.loc[city_mpg_is_23, 'horsepower'].median())
auto['peak_rpm'] = auto['peak_rpm'].fillna(auto.loc[city_mpg_is_23, 'peak_rpm'].median())
# Check the changes
auto[auto.city_mpg == 23]

# Note: the missing normalized_losses values are left alone. Per the original
# notes they can likely be calculated from a whole range of variables, which is
# something to raise with the dataset owner.
auto.isna().sum()
# Pie chart of the share of automobiles per fuel type. Labels are shown
# capitalised, and the second slice is pulled out slightly via `explode`.
fuel_type_counts = auto['fuel_type'].str.capitalize().dropna().value_counts()
fuel_type_counts.plot.pie(
    autopct='%0.1f%%',
    cmap='Set2',
    explode=(0, 0.11),
    shadow=True,
)
plt.title('Automobiles\nPercentage by Fuel Type', fontsize=16, fontweight='bold')
# Blank out both axis labels.
plt.xlabel('')
plt.ylabel('')
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.show()
# Horizontal bar chart of the number of automobiles at each risk level.
risk_level_counts = auto['symboling'].value_counts()
risk_level_counts.plot(kind='barh', cmap='Set2', width=0.9)
plt.title('Automobiles\nCount by Risk Level', fontsize=16, fontweight='bold')
plt.xlabel('Automobiles', fontsize=16)
plt.ylabel('Risk Level (Symboling)', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# 30-bin histogram of the compression ratio.
auto['compression_ratio'].plot.hist(bins=30, cmap='Set2')
plt.title('Automobiles\nFrequency of Compression Ratio', fontsize=16, fontweight='bold')
plt.xlabel('Compression Ratio', fontsize=16)
plt.ylabel('Frequency', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# Scatter plot of price against horsepower.
sns.relplot(x='horsepower',
            y='price',
            alpha = 0.4,
            data=auto,
            color='mediumseagreen'
            )
plt.title('Automobiles\nPrice Vs. Horsepower', fontsize = 16, fontweight = 'bold')
plt.xlabel('Horsepower', fontsize = 16)
plt.ylabel('Price', fontsize = 16)
plt.xticks(fontsize = 12)
# Match the y tick size to the x ticks, as the other titled plots in this
# script do.
plt.yticks(fontsize = 12)
plt.show()
# Box plot of price for each risk level (symboling).
sns.boxplot(data=auto, x='symboling', y='price', width=0.2, palette='Set2')
plt.title('Automobiles\nPrice by Risk Level', fontsize=16, fontweight='bold')
plt.xlabel('Risk Level (Symboling)', fontsize=16)
plt.ylabel('Price', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# Strip plot of compression ratio against fuel type. A strip plot gives a
# scatterplot-style view when one axis is categorical; jitter spreads the
# points sideways so they overlap less.
sns.stripplot(
    data=auto,
    x='fuel_type',
    y='compression_ratio',
    jitter=True,
    alpha=0.4,
    palette='Set2',
)
plt.title('Automobiles\nCompression Ratio Vs Fuel Type', fontsize=16, fontweight='bold')
plt.xlabel('Fuel Type', fontsize=16)
plt.ylabel('Compression Ratio', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
from pandas.plotting import scatter_matrix

# Pairwise scatter-plot matrix of the dataframe on a 30x30 figure, with a
# histogram of each variable on the diagonal.
matrix_options = {
    'alpha': 0.7,
    'figsize': (30, 30),
    'diagonal': 'hist',
    'color': 'mediumseagreen',
}
scatter_matrix(auto, **matrix_options)
plt.show()