Task 1: Data Preparation

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set the pathname for the csv file
auto_path = 'Automobile.csv'

# Create the a dataframe called 'auto' from the original csv and add the headers.
# Looking at the raw csv file, I could see it did not have headers, and it used '#' as a separator.
# Use the headings as provided in the assignment brief
auto = pd.read_csv(auto_path, sep = '#', names = ['symboling', 'normalized_losses', 'make', 'fuel_type', 
                                                  'aspiration', 'num_of_doors', 'body_style', 'drive_wheels',
                                                  'engine_location', 'wheel_base', 'length', 'width', 
                                                  'height', 'curb_weight', 'engine_type', 'num_of_cylinders', 
                                                  'engine_size', 'fuel_system', 'bore', 'stroke', 
                                                  'compression_ratio', 'horsepower', 'peak_rpm', 'city_mpg', 
                                                  'highway_mpg', 'price'
                                                 ])
auto.head()
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
0 3 NaN alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111.0 5000.0 21 27 13495.0
1 3 NaN alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111.0 5000.0 21 27 16500.0
2 1 NaN alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154.0 5000.0 19 26 16500.0
3 2 164.0 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.40 10.0 102.0 5500.0 24 30 13950.0
4 2 164.0 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.40 8.0 115.0 5500.0 18 22 17450.0

5 rows × 26 columns

auto.dtypes
symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_of_doors          object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_of_cylinders      object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object
auto.isna().sum()
symboling             0
normalized_losses    47
make                  0
fuel_type             0
aspiration            0
num_of_doors          2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_of_cylinders      0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64
auto.make = auto.make.str.strip().str.upper()
auto.fuel_type = auto.fuel_type.str.strip().str.upper()
auto.aspiration = auto.aspiration.str.strip().str.upper()
auto.num_of_doors = auto.num_of_doors.str.strip().str.upper()
auto.body_style = auto.body_style.str.strip().str.upper()
auto.drive_wheels = auto.drive_wheels.str.strip().str.upper()
auto.engine_location = auto.engine_location.str.strip().str.upper()
auto.engine_type = auto.engine_type.str.strip().str.upper()
auto.num_of_cylinders = auto.num_of_cylinders.str.strip().str.upper()
auto.fuel_system = auto.fuel_system.str.strip().str.upper()
# after we have just cleaned up all the string data.
auto.drop_duplicates(keep='first', inplace=True)
# Remember the possible values are -3, -2, -1, 0, 1, 2, 3
auto.symboling.value_counts()
 0    67
 1    54
 2    32
 3    27
-1    26
-2     3
 4     1
Name: symboling, dtype: int64
auto[auto.symboling == 4]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
205 4 25.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0

1 rows × 26 columns

auto.make.value_counts()
TOYOTA           32
NISSAN           18
MAZDA            17
VOLVO            15
MITSUBISHI       13
HONDA            13
VOLKSWAGEN       12
SUBARU           12
PEUGOT           11
DODGE             9
BMW               8
MERCEDES-BENZ     8
PLYMOUTH          7
AUDI              7
SAAB              6
PORSCHE           5
ISUZU             4
CHEVROLET         3
ALFA-ROMERO       3
JAGUAR            3
RENAULT           2
VOL00112OV        1
MERCURY           1
Name: make, dtype: int64
auto.make = auto.make.replace('VOL00112OV', 'VOLVO')
# Check the changes
auto.make.value_counts()
TOYOTA           32
NISSAN           18
MAZDA            17
VOLVO            16
MITSUBISHI       13
HONDA            13
SUBARU           12
VOLKSWAGEN       12
PEUGOT           11
DODGE             9
BMW               8
MERCEDES-BENZ     8
PLYMOUTH          7
AUDI              7
SAAB              6
PORSCHE           5
ISUZU             4
CHEVROLET         3
ALFA-ROMERO       3
JAGUAR            3
RENAULT           2
MERCURY           1
Name: make, dtype: int64
# Filter rows based on the 'make' of the rows above
auto[(auto.make == 'VOLVO') & (auto.aspiration == 'STD') & (auto.fuel_type == 'GAS')]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
194 -2 103.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 12940.0
195 -1 74.0 VOLVO GAS STD FOUR WAGON RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 13415.0
196 -2 103.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 24 28 15985.0
197 -1 74.0 VOLVO GAS STD FOUR WAGON RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 24 28 16515.0
200 -1 95.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0
202 -1 95.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 173 MPFI 3.58 2.87 8.8 134.0 5500.0 18 23 21485.0
205 4 25.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0

7 rows × 26 columns

auto[(auto.make == 'VOLVO') & (auto.price == 16845)]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
200 -1 95.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0
205 4 25.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0

2 rows × 26 columns

# However, this then makes the vehicle a duplicate except for the normalized losses value.
# We could discuss this further with the dataset owner to determine if the 25.0 in the normalised losses is 
# incorrect and perhaps should be 95.0 and therefor a duplicate record.
# For the sake of this exercise, we will assume that they are 2 different models, one with a genuine normalized loss of 25.

# Change the normalized loss of 4 to -1
auto.symboling = auto.symboling.replace(4, -1)

# Check the changes
auto.symboling.value_counts()
 0    67
 1    54
 2    32
-1    27
 3    27
-2     3
Name: symboling, dtype: int64
auto.normalized_losses.describe()
count    167.000000
mean     121.095808
std       36.028977
min       25.000000
25%       94.000000
50%      115.000000
75%      150.000000
max      256.000000
Name: normalized_losses, dtype: float64
auto.normalized_losses.plot(kind = 'hist', bins = 20)
plt.show()
# Let's have a look at these automobiles for more info.
auto[(auto.normalized_losses < 50) | (auto.normalized_losses > 200)]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
106 1 231.0 NISSAN GAS STD TWO HATCHBACK RWD FRONT 99.2 ... 181 MPFI 3.43 3.27 9.0 160.0 5200.0 19 25 18399.0
190 3 256.0 VOLKSWAGEN GAS STD TWO HATCHBACK FWD FRONT 94.5 ... 109 MPFI 3.19 3.40 8.5 90.0 5500.0 24 29 9980.0
205 -1 25.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0

3 rows × 26 columns

# the low risk automobile. We will leave these point as they are
auto.fuel_type.value_counts()
GAS       188
DIESEL     22
Name: fuel_type, dtype: int64
auto.aspiration.value_counts()
STD         169
TURBO        40
TURRRRBO      1
Name: aspiration, dtype: int64
auto.aspiration = auto.aspiration.replace('TURRRRBO', 'TURBO')

# Check the changes
auto.aspiration.value_counts()
STD      169
TURBO     41
Name: aspiration, dtype: int64
auto.num_of_doors.value_counts()
FOUR     118
TWO       89
FOURR      1
Name: num_of_doors, dtype: int64
auto.num_of_doors = auto.num_of_doors.replace('FOURR', 'FOUR')

# Check the changes
auto.num_of_doors.value_counts()
FOUR    119
TWO      89
Name: num_of_doors, dtype: int64
auto[auto.num_of_doors.isna()]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
27 1 148.0 DODGE GAS TURBO NaN SEDAN FWD FRONT 93.7 ... 98 MPFI 3.03 3.39 7.6 102.0 5500.0 24 30 8558.0
63 0 NaN MAZDA DIESEL STD NaN SEDAN FWD FRONT 98.8 ... 122 IDI 3.39 3.39 22.7 64.0 4650.0 36 42 10795.0

2 rows × 26 columns

auto[(auto.make == 'DODGE') | (auto.make == 'MAZDA')]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
21 1 118.0 DODGE GAS STD TWO HATCHBACK FWD FRONT 93.7 ... 90 2BBL 2.97 3.23 9.41 68.0 5500.0 37 41 5572.0
22 1 118.0 DODGE GAS STD TWO HATCHBACK FWD FRONT 93.7 ... 90 2BBL 2.97 3.23 9.40 68.0 5500.0 31 38 6377.0
23 1 118.0 DODGE GAS TURBO TWO HATCHBACK FWD FRONT 93.7 ... 98 MPFI 3.03 3.39 7.60 102.0 5500.0 24 30 7957.0
24 1 148.0 DODGE GAS STD FOUR HATCHBACK FWD FRONT 93.7 ... 90 2BBL 2.97 3.23 9.40 68.0 5500.0 31 38 6229.0
25 1 148.0 DODGE GAS STD FOUR SEDAN FWD FRONT 93.7 ... 90 2BBL 2.97 3.23 9.40 68.0 5500.0 31 38 6692.0
26 1 148.0 DODGE GAS STD FOUR SEDAN FWD FRONT 93.7 ... 90 2BBL 2.97 3.23 9.40 68.0 5500.0 31 38 7609.0
27 1 148.0 DODGE GAS TURBO NaN SEDAN FWD FRONT 93.7 ... 98 MPFI 3.03 3.39 7.60 102.0 5500.0 24 30 8558.0
28 -1 110.0 DODGE GAS STD FOUR WAGON FWD FRONT 103.3 ... 122 2BBL 3.34 3.46 8.50 88.0 5000.0 24 30 8921.0
29 3 145.0 DODGE GAS TURBO TWO HATCHBACK FWD FRONT 95.9 ... 156 MFI 3.60 3.90 7.00 145.0 5000.0 19 24 12964.0
50 1 104.0 MAZDA GAS STD TWO HATCHBACK FWD FRONT 93.1 ... 91 2BBL 3.03 3.15 9.00 68.0 5000.0 30 31 5195.0
51 1 104.0 MAZDA GAS STD TWO HATCHBACK FWD FRONT 93.1 ... 91 2BBL 3.03 3.15 9.00 68.0 5000.0 31 38 6095.0
52 1 104.0 MAZDA GAS STD TWO HATCHBACK FWD FRONT 93.1 ... 91 2BBL 3.03 3.15 9.00 68.0 5000.0 31 38 6795.0
53 1 113.0 MAZDA GAS STD FOUR SEDAN FWD FRONT 93.1 ... 91 2BBL 3.03 3.15 9.00 68.0 5000.0 31 38 6695.0
54 1 113.0 MAZDA GAS STD FOUR SEDAN FWD FRONT 93.1 ... 91 2BBL 3.08 3.15 9.00 68.0 5000.0 31 38 7395.0
55 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 70 4BBL NaN NaN 9.40 101.0 6000.0 17 23 10945.0
56 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 70 4BBL NaN NaN 9.40 101.0 6000.0 17 23 11845.0
57 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 70 4BBL NaN NaN 9.40 101.0 6000.0 17 23 13645.0
58 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 80 MPFI NaN NaN 9.40 135.0 6000.0 16 23 15645.0
59 1 129.0 MAZDA GAS STD TWO HATCHBACK FWD FRONT 98.8 ... 122 2BBL 3.39 3.39 8.60 84.0 4800.0 26 32 8845.0
60 0 115.0 MAZDA GAS STD FOUR SEDAN FWD FRONT 98.8 ... 122 2BBL 3.39 3.39 8.60 84.0 4800.0 26 32 8495.0
61 1 129.0 MAZDA GAS STD TWO HATCHBACK FWD FRONT 98.8 ... 122 2BBL 3.39 3.39 8.60 84.0 4800.0 26 32 10595.0
62 0 115.0 MAZDA GAS STD FOUR SEDAN FWD FRONT 98.8 ... 122 2BBL 3.39 3.39 8.60 84.0 4800.0 26 32 10245.0
63 0 NaN MAZDA DIESEL STD NaN SEDAN FWD FRONT 98.8 ... 122 IDI 3.39 3.39 22.70 64.0 4650.0 36 42 10795.0
64 0 115.0 MAZDA GAS STD FOUR HATCHBACK FWD FRONT 98.8 ... 122 2BBL 3.39 3.39 8.60 84.0 4800.0 26 32 11245.0
65 0 118.0 MAZDA GAS STD FOUR SEDAN RWD FRONT 104.9 ... 140 MPFI 3.76 3.16 8.00 120.0 5000.0 19 27 18280.0
66 0 NaN MAZDA DIESEL STD FOUR SEDAN RWD FRONT 104.9 ... 134 IDI 3.43 3.64 22.00 72.0 4200.0 31 39 18344.0

26 rows × 26 columns

# Update the 2 NaNs as FOUR
auto.num_of_doors = auto.num_of_doors.fillna('FOUR')

# Check the changes
auto.num_of_doors.isna().sum()
0
auto.body_style.value_counts()
SEDAN          101
HATCHBACK       70
WAGON           25
HARDTOP          8
CONVERTIBLE      6
Name: body_style, dtype: int64
auto.drive_wheels.value_counts()
FWD    120
RWD     81
4WD      9
Name: drive_wheels, dtype: int64
auto.engine_location.value_counts()
FRONT    207
REAR       3
Name: engine_location, dtype: int64
auto.wheel_base.describe()
count    210.000000
mean      99.002857
std        6.155714
min       86.600000
25%       94.500000
50%       97.000000
75%      102.400000
max      120.900000
Name: wheel_base, dtype: float64
auto.wheel_base.plot(kind = 'hist', bins = 15)
plt.show()
auto.length.describe()
count    210.000000
mean     174.400476
std       12.395514
min      141.100000
25%      166.800000
50%      173.200000
75%      184.600000
max      208.100000
Name: length, dtype: float64
auto.length.plot(kind = 'hist', bins = 15)
plt.show()
auto.width.describe()
count    210.000000
mean      65.979048
std        2.168156
min       60.300000
25%       64.125000
50%       65.500000
75%       66.900000
max       72.300000
Name: width, dtype: float64
auto.width.plot(kind = 'hist', bins = 15)
plt.show()
auto.height.describe()
count    210.000000
mean      53.767143
std        2.429310
min       47.800000
25%       52.000000
50%       54.100000
75%       55.500000
max       59.800000
Name: height, dtype: float64
auto.height.plot(kind = 'hist', bins = 15)
plt.show()
auto.curb_weight.describe()
count     210.000000
mean     2568.576190
std       521.387371
min      1488.000000
25%      2174.250000
50%      2434.000000
75%      2969.750000
max      4066.000000
Name: curb_weight, dtype: float64
auto.curb_weight.plot(kind = 'hist', bins = 15)
plt.show()
auto.engine_type.value_counts()
OHC      153
OHCF      15
OHCV      13
L         12
DOHC      12
ROTOR      4
DOHCV      1
Name: engine_type, dtype: int64
auto.num_of_cylinders.value_counts()
FOUR      162
SIX        26
FIVE       11
EIGHT       5
TWO         4
THREE       1
TWELVE      1
Name: num_of_cylinders, dtype: int64
# Let's have a better look at them and look at the engine size and characteristics to check
auto[(auto.num_of_cylinders == 'THREE') | (auto.num_of_cylinders == 'TWELVE')]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
18 2 121.0 CHEVROLET GAS STD TWO HATCHBACK FWD FRONT 88.4 ... 61 2BBL 2.91 3.03 9.5 48.0 5100.0 47 53 5151.0
49 0 NaN JAGUAR GAS STD TWO SEDAN RWD FRONT 102.0 ... 326 MPFI 3.54 2.76 11.5 262.0 5000.0 13 17 36000.0

2 rows × 26 columns

# We can also see that the 12 cyclinder has a very large engine, very high horsepower and very poor fuel consumption
# I am happy that these num_of_cylinder values are correct.
auto.engine_size.describe()
count    210.000000
mean     127.280952
std       41.212509
min       61.000000
25%       98.000000
50%      120.000000
75%      144.000000
max      326.000000
Name: engine_size, dtype: float64
auto.engine_size.plot(kind = 'hist', bins = 15)
plt.show()
auto.fuel_system.value_counts()
MPFI    97
2BBL    66
IDI     22
1BBL    11
SPDI     9
4BBL     3
SPFI     1
MFI      1
Name: fuel_system, dtype: int64
auto.bore.describe()
count    206.000000
mean       3.333204
std        0.277400
min        2.540000
25%        3.150000
50%        3.310000
75%        3.590000
max        3.940000
Name: bore, dtype: float64
auto.bore.plot(kind = 'hist', bins = 15)
plt.show()
auto.stroke.describe()
count    206.000000
mean       3.255291
std        0.313417
min        2.070000
25%        3.110000
50%        3.290000
75%        3.410000
max        4.170000
Name: stroke, dtype: float64
auto.stroke.plot(kind = 'hist', bins = 15)
plt.show()
auto.compression_ratio.describe()
count    210.00000
mean      10.25581
std        4.12003
min        7.00000
25%        8.60000
50%        9.00000
75%        9.40000
max       23.00000
Name: compression_ratio, dtype: float64
auto.compression_ratio.plot(kind = 'hist', bins = 15)
plt.show()
# Though I do have a hypothesis I'd like to test
auto.horsepower.describe()
count    208.000000
mean     104.413462
std       39.249385
min       48.000000
25%       70.000000
50%       96.000000
75%      116.000000
max      288.000000
Name: horsepower, dtype: float64
auto.horsepower.plot(kind = 'hist', bins = 15)
plt.show()
auto.peak_rpm.describe()
count     208.000000
mean     5126.201923
std       475.738427
min      4150.000000
25%      4800.000000
50%      5200.000000
75%      5500.000000
max      6600.000000
Name: peak_rpm, dtype: float64
auto.peak_rpm.plot(kind = 'hist', bins = 15)
plt.show()
auto.city_mpg.describe()
count    210.000000
mean      25.157143
std        6.493948
min       13.000000
25%       19.000000
50%       24.000000
75%       30.000000
max       49.000000
Name: city_mpg, dtype: float64
auto.city_mpg.plot(kind = 'hist', bins = 10)
plt.show()
auto.highway_mpg.describe()
count    210.000000
mean      30.647619
std        6.838508
min       16.000000
25%       25.000000
50%       30.000000
75%       34.000000
max       54.000000
Name: highway_mpg, dtype: float64
auto.highway_mpg.plot(kind = 'hist', bins = 15)
plt.show()
auto.price.describe()
count      206.000000
mean     13187.247573
std       8014.523084
min          0.000000
25%       7775.000000
50%      10320.000000
75%      16512.000000
max      45400.000000
Name: price, dtype: float64
auto.price.plot(kind = 'hist', bins = 50)
plt.show()
auto[auto.price == 0]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
210 -1 95.0 VOLVO DIESEL TURBO FOUR SEDAN RWD FRONT 109.1 ... 145 IDI 3.01 3.40 23.0 106.0 4800.0 26 27 0.0
211 -1 95.0 VOLVO GAS TURBO FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 19 25 0.0

2 rows × 26 columns

# Let's filter to show similar models
auto[auto.make == 'VOLVO']
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
194 -2 103.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 12940.0
195 -1 74.0 VOLVO GAS STD FOUR WAGON RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 13415.0
196 -2 103.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 24 28 15985.0
197 -1 74.0 VOLVO GAS STD FOUR WAGON RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 24 28 16515.0
198 -2 103.0 VOLVO GAS TURBO FOUR SEDAN RWD FRONT 104.3 ... 130 MPFI 3.62 3.15 7.5 162.0 5100.0 17 22 18420.0
199 -1 74.0 VOLVO GAS TURBO FOUR WAGON RWD FRONT 104.3 ... 130 MPFI 3.62 3.15 7.5 162.0 5100.0 17 22 18950.0
200 -1 95.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0
201 -1 95.0 VOLVO GAS TURBO FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 8.7 160.0 5300.0 19 25 19045.0
202 -1 95.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 173 MPFI 3.58 2.87 8.8 134.0 5500.0 18 23 21485.0
203 -1 95.0 VOLVO DIESEL TURBO FOUR SEDAN RWD FRONT 109.1 ... 145 IDI 3.01 3.40 23.0 106.0 4800.0 26 27 22470.0
204 -1 95.0 VOLVO GAS TURBO FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 19 25 22625.0
205 -1 25.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0
208 -1 NaN VOLVO DIESEL TURBO FOUR SEDAN RWD FRONT 109.1 ... 145 IDI 3.01 3.40 23.0 106.0 4800.0 26 27 22470.0
209 -1 NaN VOLVO GAS TURBO FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 19 25 22625.0
210 -1 95.0 VOLVO DIESEL TURBO FOUR SEDAN RWD FRONT 109.1 ... 145 IDI 3.01 3.40 23.0 106.0 4800.0 26 27 0.0
211 -1 95.0 VOLVO GAS TURBO FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 19 25 0.0

16 rows × 26 columns

auto.at[210,'price'] = 22470
auto.at[211,'price'] = 22625
auto.at[208,'normalized_losses'] = 95.0
auto.at[209,'normalized_losses'] = 95.0

# Now drop the duplicates
# We have basically just merged these rows
auto.drop_duplicates(keep='first', inplace=True)

# Check the changes
auto[auto.make == 'VOLVO']
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
194 -2 103.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 12940.0
195 -1 74.0 VOLVO GAS STD FOUR WAGON RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 13415.0
196 -2 103.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 24 28 15985.0
197 -1 74.0 VOLVO GAS STD FOUR WAGON RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 24 28 16515.0
198 -2 103.0 VOLVO GAS TURBO FOUR SEDAN RWD FRONT 104.3 ... 130 MPFI 3.62 3.15 7.5 162.0 5100.0 17 22 18420.0
199 -1 74.0 VOLVO GAS TURBO FOUR WAGON RWD FRONT 104.3 ... 130 MPFI 3.62 3.15 7.5 162.0 5100.0 17 22 18950.0
200 -1 95.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0
201 -1 95.0 VOLVO GAS TURBO FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 8.7 160.0 5300.0 19 25 19045.0
202 -1 95.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 173 MPFI 3.58 2.87 8.8 134.0 5500.0 18 23 21485.0
203 -1 95.0 VOLVO DIESEL TURBO FOUR SEDAN RWD FRONT 109.1 ... 145 IDI 3.01 3.40 23.0 106.0 4800.0 26 27 22470.0
204 -1 95.0 VOLVO GAS TURBO FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 19 25 22625.0
205 -1 25.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0

12 rows × 26 columns

auto.isnull().sum()
symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_of_doors          0
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_of_cylinders      0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64
auto[(auto.bore.isnull()) | (auto.stroke.isnull()) | (auto.horsepower.isnull()) | (auto.peak_rpm.isnull() | (auto.price.isnull()))]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
9 0 NaN AUDI GAS TURBO TWO HATCHBACK 4WD FRONT 99.5 ... 131 MPFI 3.13 3.40 7.0 160.0 5500.0 16 22 NaN
44 1 NaN ISUZU GAS STD TWO SEDAN FWD FRONT 94.5 ... 90 2BBL 3.03 3.11 9.6 70.0 5400.0 38 43 NaN
45 0 NaN ISUZU GAS STD FOUR SEDAN FWD FRONT 94.5 ... 90 2BBL 3.03 3.11 9.6 70.0 5400.0 38 43 NaN
55 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 70 4BBL NaN NaN 9.4 101.0 6000.0 17 23 10945.0
56 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 70 4BBL NaN NaN 9.4 101.0 6000.0 17 23 11845.0
57 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 70 4BBL NaN NaN 9.4 101.0 6000.0 17 23 13645.0
58 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 80 MPFI NaN NaN 9.4 135.0 6000.0 16 23 15645.0
129 1 NaN PORSCHE GAS STD TWO HATCHBACK RWD FRONT 98.4 ... 203 MPFI 3.94 3.11 10.0 288.0 5750.0 17 28 NaN
130 0 NaN RENAULT GAS STD FOUR WAGON FWD FRONT 96.1 ... 132 MPFI 3.46 3.90 8.7 NaN NaN 23 31 9295.0
131 2 NaN RENAULT GAS STD TWO HATCHBACK FWD FRONT 96.1 ... 132 MPFI 3.46 3.90 8.7 NaN NaN 23 31 9895.0

10 rows × 26 columns

auto[auto.make == 'MAZDA']
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
50 1 104.0 MAZDA GAS STD TWO HATCHBACK FWD FRONT 93.1 ... 91 2BBL 3.03 3.15 9.0 68.0 5000.0 30 31 5195.0
51 1 104.0 MAZDA GAS STD TWO HATCHBACK FWD FRONT 93.1 ... 91 2BBL 3.03 3.15 9.0 68.0 5000.0 31 38 6095.0
52 1 104.0 MAZDA GAS STD TWO HATCHBACK FWD FRONT 93.1 ... 91 2BBL 3.03 3.15 9.0 68.0 5000.0 31 38 6795.0
53 1 113.0 MAZDA GAS STD FOUR SEDAN FWD FRONT 93.1 ... 91 2BBL 3.03 3.15 9.0 68.0 5000.0 31 38 6695.0
54 1 113.0 MAZDA GAS STD FOUR SEDAN FWD FRONT 93.1 ... 91 2BBL 3.08 3.15 9.0 68.0 5000.0 31 38 7395.0
55 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 70 4BBL NaN NaN 9.4 101.0 6000.0 17 23 10945.0
56 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 70 4BBL NaN NaN 9.4 101.0 6000.0 17 23 11845.0
57 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 70 4BBL NaN NaN 9.4 101.0 6000.0 17 23 13645.0
58 3 150.0 MAZDA GAS STD TWO HATCHBACK RWD FRONT 95.3 ... 80 MPFI NaN NaN 9.4 135.0 6000.0 16 23 15645.0
59 1 129.0 MAZDA GAS STD TWO HATCHBACK FWD FRONT 98.8 ... 122 2BBL 3.39 3.39 8.6 84.0 4800.0 26 32 8845.0
60 0 115.0 MAZDA GAS STD FOUR SEDAN FWD FRONT 98.8 ... 122 2BBL 3.39 3.39 8.6 84.0 4800.0 26 32 8495.0
61 1 129.0 MAZDA GAS STD TWO HATCHBACK FWD FRONT 98.8 ... 122 2BBL 3.39 3.39 8.6 84.0 4800.0 26 32 10595.0
62 0 115.0 MAZDA GAS STD FOUR SEDAN FWD FRONT 98.8 ... 122 2BBL 3.39 3.39 8.6 84.0 4800.0 26 32 10245.0
63 0 NaN MAZDA DIESEL STD FOUR SEDAN FWD FRONT 98.8 ... 122 IDI 3.39 3.39 22.7 64.0 4650.0 36 42 10795.0
64 0 115.0 MAZDA GAS STD FOUR HATCHBACK FWD FRONT 98.8 ... 122 2BBL 3.39 3.39 8.6 84.0 4800.0 26 32 11245.0
65 0 118.0 MAZDA GAS STD FOUR SEDAN RWD FRONT 104.9 ... 140 MPFI 3.76 3.16 8.0 120.0 5000.0 19 27 18280.0
66 0 NaN MAZDA DIESEL STD FOUR SEDAN RWD FRONT 104.9 ... 134 IDI 3.43 3.64 22.0 72.0 4200.0 31 39 18344.0

17 rows × 26 columns

 
sns.relplot(x='horsepower', 
            y='city_mpg', 
            alpha = 0.4, 
            data=auto,
            color='mediumseagreen'
            )
plt.show()
auto[(auto.city_mpg == 23)]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
10 2 192.0 BMW GAS STD TWO SEDAN RWD FRONT 101.2 ... 108 MPFI 3.50 2.80 8.8 101.0 5800.0 23 29 16430.0
11 0 192.0 BMW GAS STD FOUR SEDAN RWD FRONT 101.2 ... 108 MPFI 3.50 2.80 8.8 101.0 5800.0 23 29 16925.0
80 3 153.0 MITSUBISHI GAS TURBO TWO HATCHBACK FWD FRONT 96.3 ... 110 SPDI 3.17 3.46 7.5 116.0 5500.0 23 30 9959.0
87 1 125.0 MITSUBISHI GAS TURBO FOUR SEDAN FWD FRONT 96.3 ... 110 SPDI 3.17 3.46 7.5 116.0 5500.0 23 30 9279.0
88 -1 137.0 MITSUBISHI GAS STD FOUR SEDAN FWD FRONT 96.3 ... 110 SPDI 3.17 3.46 7.5 116.0 5500.0 23 30 9279.0
130 0 NaN RENAULT GAS STD FOUR WAGON FWD FRONT 96.1 ... 132 MPFI 3.46 3.90 8.7 NaN NaN 23 31 9295.0
131 2 NaN RENAULT GAS STD TWO HATCHBACK FWD FRONT 96.1 ... 132 MPFI 3.46 3.90 8.7 NaN NaN 23 31 9895.0
148 0 85.0 SUBARU GAS STD FOUR WAGON 4WD FRONT 96.9 ... 108 2BBL 3.62 2.64 9.0 82.0 4800.0 23 29 8013.0
149 0 85.0 SUBARU GAS TURBO FOUR WAGON 4WD FRONT 96.9 ... 108 MPFI 3.62 2.64 7.7 111.0 4800.0 23 23 11694.0
194 -2 103.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 12940.0
195 -1 74.0 VOLVO GAS STD FOUR WAGON RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 13415.0
200 -1 95.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0
205 -1 25.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0

13 rows × 26 columns

auto.horsepower.fillna(auto[auto.city_mpg == 23].loc[:,'horsepower'].median(), inplace=True)
auto.peak_rpm.fillna(auto[auto.city_mpg == 23].loc[:,'peak_rpm'].median(), inplace=True)

# Check the changes
auto[auto.city_mpg == 23]
symboling normalized_losses make fuel_type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
10 2 192.0 BMW GAS STD TWO SEDAN RWD FRONT 101.2 ... 108 MPFI 3.50 2.80 8.8 101.0 5800.0 23 29 16430.0
11 0 192.0 BMW GAS STD FOUR SEDAN RWD FRONT 101.2 ... 108 MPFI 3.50 2.80 8.8 101.0 5800.0 23 29 16925.0
80 3 153.0 MITSUBISHI GAS TURBO TWO HATCHBACK FWD FRONT 96.3 ... 110 SPDI 3.17 3.46 7.5 116.0 5500.0 23 30 9959.0
87 1 125.0 MITSUBISHI GAS TURBO FOUR SEDAN FWD FRONT 96.3 ... 110 SPDI 3.17 3.46 7.5 116.0 5500.0 23 30 9279.0
88 -1 137.0 MITSUBISHI GAS STD FOUR SEDAN FWD FRONT 96.3 ... 110 SPDI 3.17 3.46 7.5 116.0 5500.0 23 30 9279.0
130 0 NaN RENAULT GAS STD FOUR WAGON FWD FRONT 96.1 ... 132 MPFI 3.46 3.90 8.7 114.0 5400.0 23 31 9295.0
131 2 NaN RENAULT GAS STD TWO HATCHBACK FWD FRONT 96.1 ... 132 MPFI 3.46 3.90 8.7 114.0 5400.0 23 31 9895.0
148 0 85.0 SUBARU GAS STD FOUR WAGON 4WD FRONT 96.9 ... 108 2BBL 3.62 2.64 9.0 82.0 4800.0 23 29 8013.0
149 0 85.0 SUBARU GAS TURBO FOUR WAGON 4WD FRONT 96.9 ... 108 MPFI 3.62 2.64 7.7 111.0 4800.0 23 23 11694.0
194 -2 103.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 12940.0
195 -1 74.0 VOLVO GAS STD FOUR WAGON RWD FRONT 104.3 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 13415.0
200 -1 95.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0
205 -1 25.0 VOLVO GAS STD FOUR SEDAN RWD FRONT 109.1 ... 141 MPFI 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0

13 rows × 26 columns

# Note we wont worry too much about the normalized_losses as this can likely be calculated by a whole range
# of variables. It would be something we talk to the dataset owner about to get a better idea of how to
auto.isna().sum()
symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_of_doors          0
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_of_cylinders      0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            0
peak_rpm              0
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

Task 2: Data Exploration

2.1

Categorical Variable: Fuel Type

auto.fuel_type.str.capitalize().dropna().value_counts().plot(kind = 'pie',
                                                             autopct = '%0.1f%%',
                                                             cmap='Set2',
                                                             explode = (0,0.11),
                                                             shadow = True,
                                                             )

plt.title('Automobiles\nPercentage by Fuel Type', fontsize = 16, fontweight = 'bold')
plt.xlabel('')
plt.ylabel('')
plt.axis('equal')
plt.show()

Ordinal Variable: Symboling

auto.symboling.value_counts().plot.barh(cmap = 'Set2',
                                        width = 0.9,
                                        )
plt.title('Automobiles\nCount by Risk Level', fontsize = 16, fontweight = 'bold')
plt.xlabel('Automobiles', fontsize = 16)
plt.ylabel('Risk Level (Symboling)', fontsize = 16)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.show()

Numerical Variable: Compression Ratio

auto.compression_ratio.plot(kind = 'hist',
                            bins = 30,
                            cmap = 'Set2',
                            )
plt.title('Automobiles\nFrequency of Compression Ratio', fontsize = 16, fontweight = 'bold')
plt.xlabel('Compression Ratio', fontsize = 16)
plt.ylabel('Frequency', fontsize = 16)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.show()

Task 2.2

sns.relplot(x='horsepower', 
            y='price', 
            alpha = 0.4, 
            data=auto,
            color='mediumseagreen'
            )
plt.title('Automobiles\nPrice Vs. Horsepower', fontsize = 16, fontweight = 'bold')
plt.xlabel('Horsepower', fontsize = 16)
plt.ylabel('Price', fontsize = 16)
plt.xticks(fontsize = 12)
plt.show()
sns.boxplot(x='symboling', 
            y='price', 
            data=auto, 
            width=0.2, 
            palette='Set2'
            )
plt.title('Automobiles\nPrice by Risk Level', fontsize = 16, fontweight = 'bold')
plt.xlabel('Risk Level (Symboling)', fontsize = 16)
plt.ylabel('Price', fontsize = 16)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.show()
# Create a strip plot of compression ratio vs fuel type
# Strip plots are useful if you want a scatterplot style plot but with categorical data on one axis
sns.stripplot(x='fuel_type', 
              y='compression_ratio', 
              data=auto, jitter=True,
              alpha = 0.4, 
              palette = 'Set2'
              )
plt.title('Automobiles\nCompression Ratio Vs Fuel Type', fontsize = 16, fontweight = 'bold')
plt.xlabel('Fuel Type', fontsize = 16)
plt.ylabel('Compression Ratio', fontsize = 16)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.show()

Task 2.3

from pandas.plotting import scatter_matrix

scatter_matrix(auto, 
               alpha = 0.7, 
               figsize = (30,30), 
               diagonal = 'hist',
               color = 'mediumseagreen'
               )
plt.show()