In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import metrics, svm
from sklearn import utils
import random
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
In [2]:
data=pd.read_csv("mammographic.csv")
In [3]:
data.head()
Out[3]:
5 67 3 5.1 3.1 yes
0 4 43 1 1 ? yes
1 5 58 4 5 3 yes
2 4 28 1 1 3 no
3 5 74 1 5 ? yes
4 4 65 1 ? 3 no
In [4]:
data.tail()
Out[4]:
5 67 3 5.1 3.1 yes
955 4 47 2 1 3 no
956 4 56 4 5 3 yes
957 4 64 4 5 3 no
958 5 66 4 5 3 yes
959 4 62 3 3 3 no
In [5]:
colnames=['BI-RADS','AGE','SHAPE','MARGIN','DENSITY','SEVERITY']
In [6]:
data1=pd.read_csv("mammographic.csv",names=colnames,header = None)
In [7]:
data1.head()
Out[7]:
BI-RADS AGE SHAPE MARGIN DENSITY SEVERITY
0 5 67 3 5 3 yes
1 4 43 1 1 ? yes
2 5 58 4 5 3 yes
3 4 28 1 1 3 no
4 5 74 1 5 ? yes
In [8]:
data1.tail()
Out[8]:
BI-RADS AGE SHAPE MARGIN DENSITY SEVERITY
956 4 47 2 1 3 no
957 4 56 4 5 3 yes
958 4 64 4 5 3 no
959 5 66 4 5 3 yes
960 4 62 3 3 3 no
In [9]:
na=['?']
df=pd.read_csv('mammographic.csv', na_values=na, names=colnames, header= None)
In [10]:
df.head()
Out[10]:
BI-RADS AGE SHAPE MARGIN DENSITY SEVERITY
0 5.0 67.0 3.0 5.0 3.0 yes
1 4.0 43.0 1.0 1.0 NaN yes
2 5.0 58.0 4.0 5.0 3.0 yes
3 4.0 28.0 1.0 1.0 3.0 no
4 5.0 74.0 1.0 5.0 NaN yes
In [11]:
print(df.isnull().sum())
BI-RADS      2
AGE          5
SHAPE       31
MARGIN      48
DENSITY     76
SEVERITY     0
dtype: int64

Above gives us the missing values. As we can observe the largest missing values is from the Density Variable and the Variable with no Missing values is 'SEVERITY'

In [12]:
missing_values=df.isnull().sum()
In [13]:
missing_values
Out[13]:
BI-RADS      2
AGE          5
SHAPE       31
MARGIN      48
DENSITY     76
SEVERITY     0
dtype: int64
In [14]:
df_clean=df.dropna()
In [15]:
df_clean.isnull().sum()
Out[15]:
BI-RADS     0
AGE         0
SHAPE       0
MARGIN      0
DENSITY     0
SEVERITY    0
dtype: int64
In [16]:
df.isnull().sum()
Out[16]:
BI-RADS      2
AGE          5
SHAPE       31
MARGIN      48
DENSITY     76
SEVERITY     0
dtype: int64
In [17]:
df2=df.drop(['BI-RADS'], axis=1)
df2.head()
Out[17]:
AGE SHAPE MARGIN DENSITY SEVERITY
0 67.0 3.0 5.0 3.0 yes
1 43.0 1.0 1.0 NaN yes
2 58.0 4.0 5.0 3.0 yes
3 28.0 1.0 1.0 3.0 no
4 74.0 1.0 5.0 NaN yes
In [18]:
df_clean=df2.dropna()
In [19]:
df_clean.head()
Out[19]:
AGE SHAPE MARGIN DENSITY SEVERITY
0 67.0 3.0 5.0 3.0 yes
2 58.0 4.0 5.0 3.0 yes
3 28.0 1.0 1.0 3.0 no
8 57.0 1.0 5.0 3.0 yes
10 76.0 1.0 4.0 3.0 yes

We now have to transform 'SEVERITY' column to numerical data. 1 for 'yes' and 0 for 'no'.

In [20]:
df_clean.head()
Out[20]:
AGE SHAPE MARGIN DENSITY SEVERITY
0 67.0 3.0 5.0 3.0 yes
2 58.0 4.0 5.0 3.0 yes
3 28.0 1.0 1.0 3.0 no
8 57.0 1.0 5.0 3.0 yes
10 76.0 1.0 4.0 3.0 yes
In [21]:
df3=pd.get_dummies(df_clean['SEVERITY'])
#df_clean=df2.join(pd.get_dummies(df2.pop('SEVERITY')))
In [22]:
df3.head()
#df2.head()
Out[22]:
no yes
0 0 1
2 0 1
3 1 0
8 0 1
10 0 1
In [23]:
df3.pop('no')
df3.head()
Out[23]:
yes
0 1
2 1
3 0
8 1
10 1
In [24]:
df_new=pd.concat([df_clean,df3],sort=False,axis=1)
In [25]:
df_new.head()
Out[25]:
AGE SHAPE MARGIN DENSITY SEVERITY yes
0 67.0 3.0 5.0 3.0 yes 1
2 58.0 4.0 5.0 3.0 yes 1
3 28.0 1.0 1.0 3.0 no 0
8 57.0 1.0 5.0 3.0 yes 1
10 76.0 1.0 4.0 3.0 yes 1
In [26]:
df_new = df_new.drop(['SEVERITY'], axis=1)
In [27]:
df_new.head()
Out[27]:
AGE SHAPE MARGIN DENSITY yes
0 67.0 3.0 5.0 3.0 1
2 58.0 4.0 5.0 3.0 1
3 28.0 1.0 1.0 3.0 0
8 57.0 1.0 5.0 3.0 1
10 76.0 1.0 4.0 3.0 1
In [28]:
df_new=df_new.rename(columns={'yes':'SEVERITY'})
In [29]:
df_new.head()
Out[29]:
AGE SHAPE MARGIN DENSITY SEVERITY
0 67.0 3.0 5.0 3.0 1
2 58.0 4.0 5.0 3.0 1
3 28.0 1.0 1.0 3.0 0
8 57.0 1.0 5.0 3.0 1
10 76.0 1.0 4.0 3.0 1

Our new data set has its catagorical variable transformed into a numerical. I have also droped all the null values from the dataset. Now lets see if we need to normalize our data set.

In [30]:
df_new.count()
Out[30]:
AGE         831
SHAPE       831
MARGIN      831
DENSITY     831
SEVERITY    831
dtype: int64
In [31]:
df_new.describe()
Out[31]:
AGE SHAPE MARGIN DENSITY SEVERITY
count 831.000000 831.000000 831.000000 831.000000 831.000000
mean 55.777377 2.783394 2.814681 2.915764 0.484958
std 14.663528 1.242331 1.566771 0.350737 0.500075
min 18.000000 1.000000 1.000000 1.000000 0.000000
25% 46.000000 2.000000 1.000000 3.000000 0.000000
50% 57.000000 3.000000 3.000000 3.000000 0.000000
75% 66.000000 4.000000 4.000000 3.000000 1.000000
max 96.000000 4.000000 5.000000 4.000000 1.000000

Above is the basic statistics of our dataset. This gives us a quick overview of the shape or the data

In [32]:
sns.set()
_ = plt.hist(df_new['AGE'])
_ = plt.xlabel('Age group')
_ = plt.ylabel('Distribution')
plt.show()

Above is an observation of our age group distribution, the largest age group in our dataset is in the 60 to 70 age group.

In [33]:
sns.set()
_ = plt.hist(df_new['SHAPE'],)
_ = plt.xlabel('SHAPE Group')
_ = plt.ylabel('Distribution')
plt.show()

The 'SHAPE' is a variable ordered from round to irregular. SHAPE { 1: ROUND, 2: OVAL, 3: LOBULAR, 4: IRREGULAR }. As we can observe Irregular 'SHAPE' has the largest distribution on our dataset.

In [34]:
sns.set()
_ = plt.hist(df_new['MARGIN'])
_ = plt.xlabel('MARGIN Group')
_ = plt.ylabel('Distribution')
plt.show()

The above distrubution visualize the attribute 'MARGIN':{1:circumscribed, 2:microlobulated, 3:obscured', 4:ill-defined, 5:spiculated}. From the distribution we also observe that the circumscribed values have the largest distrubutions.

In [35]:
sns.set()
_ = plt.hist(df_new['DENSITY'])
_ = plt.xlabel('DENSITY Group')
_ = plt.ylabel('Distribution')
plt.show()

The above plot describes the distribution of 'DENSITY' on our dataset. DENSITY: {1:high, 2:iso,3:low, 4: fat-containing}. From the distribution the largest category observed on our dataset is 3.0 which is the low Mass Density.

In [36]:
df_new['AGE'].hist(by=df_new['SEVERITY'],bins=20, figsize=(20,7))
Out[36]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x000002A32BC03EB8>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000002A32BBD2160>],
      dtype=object)

The Above figure is the distribution of our dataset AGE group with SEVERITY 1 and 0. We can observe that younger people are more likely to be zero or 'NO' values on SEVERITY than 1/'yes' value.

In [37]:
df_new['MARGIN'].hist(by=df_new['SEVERITY'],bins=20, figsize=(20,7))
Out[37]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x000002A32C0599B0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000002A32C0958D0>],
      dtype=object)
In [ ]:
 
In [38]:
df_new['SHAPE'].hist(by=df_new['SEVERITY'],bins=20, figsize=(20,7))
Out[38]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x000002A32C170CF8>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000002A32C1B2080>],
      dtype=object)

Now we have to standardize our dataset. StandardScaler standardizes a feature by subtracting the mean and then scaling to unit variance. Unit variance means dividing all the values by the standard deviation. StandardScaler results in a distribution with a standard deviation equal to 1.

In [39]:
standardscaler = preprocessing.StandardScaler()
df_scaler= standardscaler.fit_transform(df_new)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:645: DataConversionWarning: Data with input dtype uint8, float64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:464: DataConversionWarning: Data with input dtype uint8, float64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
In [52]:
scaled_features_df = pd.DataFrame(df_scaler, index=df_new.index, columns=df_new.columns)

Above is the our scaled dataset that has been transformed using StandardScaler.

Now we have to split our dataset into train/test data set. Where 75% for training and 25% for Testing.

In [70]:
y = scaled_features_df['SEVERITY']
attribute_columns=['AGE','SHAPE','MARGIN','DENSITY']
x = scaled_features_df[attribute_columns]

Above we 'y' defines the target variable and split our dataset in feature/attribute and target variable.

In [101]:
#Creating a training and testing vars
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)
In [102]:
#Check the shape of train 
print(x_train.shape, y_train.shape)
(623, 4) (623,)
In [103]:
#check the shape of test
print(x_test.shape, y_test.shape)
(208, 4) (208,)

Below is the creation of Decision Tree Model

In [118]:
#we have to convert our target variable into an int since the classifier expects an int object and not a float
lab_enc = preprocessing.LabelEncoder()
y_train_encoded = lab_enc.fit_transform(y_train)
y_test_encoded = lab_enc.fit_transform(y_test)
#Decision Tree classifier
clf = DecisionTreeClassifier()
In [119]:
#TRain Decision Tree
clf = clf.fit(x_train,y_train_encoded)
In [124]:
#Predict response
y_pred = clf.predict(x_test)
In [125]:
#Evaluate the model
print("Accuracy:", metrics.accuracy_score(y_test_encoded,y_pred))
Accuracy: 0.7067307692307693

Decision Tree Classifier has got 70.67% accuracy. Which is a good accuracy.

The following will utilize RandromForest Classifier for prediction

In [152]:
clf = RandomForestClassifier(n_estimators=100)
In [153]:
#Train model
clf.fit(x_train,y_train_encoded)
Out[153]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [155]:
#predict
y_pred = clf.predict(x_test)
In [156]:
#Model accuracy
print("Accuracy:",metrics.accuracy_score(y_test_encoded,y_pred))
Accuracy: 0.7548076923076923

RandomForest classifier has produced 75.48% accuracy

The following will utilize the KNN classifier

In [158]:
#KNN classifier created
knn = KNeighborsClassifier(n_neighbors=5)
In [159]:
#Train Model
knn.fit(x_train,y_train_encoded)
Out[159]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
In [160]:
#predict 
y_pred=knn.predict(x_test)
In [161]:
#Model accuracy
print("Accuracy:", metrics.accuracy_score(y_test_encoded,y_pred))
Accuracy: 0.7932692307692307

KNN has produced 79.32% accuracy.

The following will utilize Naive Bayes classifier.

In [3]:
#Gaussian classifier created
gnb = GaussianNB()
In [4]:
#Model Trained
gnb.fit(x_train,y_train_encoded)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-22e4d169b994> in <module>
      1 #Model Trained
----> 2 gnb.fit(x_train,y_train_encoded)

NameError: name 'x_train' is not defined
In [5]:
#predict
y_pred=gnb.predict(x_test)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-8bf8afb15a4c> in <module>
      1 #predict
----> 2 y_pred=gnb.predict(x_test)

NameError: name 'x_test' is not defined
In [2]:
#Model accuracy
print("Accuracy:",metrics.accuracy_score(y_test_encoded,y_pred))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-5e64f4b65919> in <module>
      1 #Model accuracy
----> 2 print("Accuracy:",metrics.accuracy_score(y_test_encoded,y_pred))

NameError: name 'y_test_encoded' is not defined

Gaussian Classifier has produced 79.807 Accuracy ~ 80%

In [ ]: