import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import metrics, svm
from sklearn import utils
import random
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

csv_path = "mammographic.csv"

# Plain read with pandas defaults: the first row of the file is taken as the header.
data = pd.read_csv(csv_path)

data.head()

data.tail()

# Explicit column labels, used together with header=None for the reads below.
colnames = ['BI-RADS', 'AGE', 'SHAPE', 'MARGIN', 'DENSITY', 'SEVERITY']
read_options = dict(names=colnames, header=None)

data1 = pd.read_csv(csv_path, **read_options)

data1.head()

data1.tail()

# Final read: additionally treat '?' entries as missing values (NaN).
na = ['?']
df = pd.read_csv(csv_path, na_values=na, **read_options)

df.head()

# Number of missing values in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

BI-RADS      2
AGE          5
SHAPE       31
MARGIN      48
DENSITY     76
SEVERITY     0
dtype: int64

The output above gives the number of missing values per column. As we can observe, the 'DENSITY' variable has the most missing values (76), and 'SEVERITY' is the only variable with no missing values.

# Per-column count of missing entries, stored in a variable and then displayed.
missing_values = df.isna().sum()

missing_values

BI-RADS      2
AGE          5
SHAPE       31
MARGIN      48
DENSITY     76
SEVERITY     0
dtype: int64

# Keep only the rows that have a value in every column.
df_clean = df.dropna(axis=0, how='any')

# Every column should now report zero missing values.
df_clean.isna().sum()

BI-RADS     0
AGE         0
SHAPE       0
MARGIN      0
DENSITY     0
SEVERITY    0
dtype: int64

# Missing-value counts of the original frame (dropna returned a new frame).
df.isna().sum()

BI-RADS      2
AGE          5
SHAPE       31
MARGIN      48
DENSITY     76
SEVERITY     0
dtype: int64

# Copy of df without the 'BI-RADS' column.
df2 = df.drop(columns=['BI-RADS'])
df2.head()

# Drop every row of df2 that still contains a missing value.
df_clean = df2.dropna(axis=0, how='any')

df_clean.head()

We now have to transform the 'SEVERITY' column into numerical data: 1 for 'yes' and 0 for 'no'.

df_clean.head()

# Encode the target directly: 1 where SEVERITY is 'yes', 0 otherwise.
# Comparing against the label instead of calling pd.get_dummies gives an
# integer column on every pandas version; get_dummies returns booleans by
# default on pandas >= 2.0.
severity_flag = (df_clean['SEVERITY'] == 'yes').astype('uint8')

# One-column indicator frame, kept under its original name and column label.
df3 = severity_flag.to_frame(name='yes')
df3.head()

# Replace the text column with its numeric encoding, keeping SEVERITY as the
# last column.
df_new = df_clean.drop(['SEVERITY'], axis=1).assign(SEVERITY=severity_flag)

df_new.head()

Our new dataset has its categorical variable transformed into a numerical one. I have also dropped all rows with null values from the dataset. Now let's see if we need to normalize our dataset.

# Number of non-missing entries in each column.
df_new.notna().sum()

AGE         831
SHAPE       831
MARGIN      831
DENSITY     831
SEVERITY    831
dtype: int64

# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df_new.describe()

Above are the basic statistics of our dataset. This gives us a quick overview of the shape of the data.

sns.set()
# Histogram of the AGE column using matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(df_new['AGE'])
ax.set_xlabel('Age group')
ax.set_ylabel('Distribution')
plt.show()

The plot above shows the age distribution of our dataset; the largest age group is the 60 to 70 group.

sns.set()
# Histogram of the SHAPE column using matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(df_new['SHAPE'])
ax.set_xlabel('SHAPE Group')
ax.set_ylabel('Distribution')
plt.show()

'SHAPE' is a variable ordered from round to irregular: SHAPE {1: ROUND, 2: OVAL, 3: LOBULAR, 4: IRREGULAR}. As we can observe, the irregular 'SHAPE' is the most frequent category in our dataset.

sns.set()
# Histogram of the MARGIN column using matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(df_new['MARGIN'])
ax.set_xlabel('MARGIN Group')
ax.set_ylabel('Distribution')
plt.show()

The distribution above visualizes the attribute 'MARGIN': {1: circumscribed, 2: microlobulated, 3: obscured, 4: ill-defined, 5: spiculated}. From the distribution we also observe that circumscribed is the most frequent value.

sns.set()
# Histogram of the DENSITY column using matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(df_new['DENSITY'])
ax.set_xlabel('DENSITY Group')
ax.set_ylabel('Distribution')
plt.show()

The plot above shows the distribution of 'DENSITY' in our dataset. DENSITY: {1: high, 2: iso, 3: low, 4: fat-containing}. The most frequent category in our dataset is 3.0, which is the low mass density.

# AGE histograms, one panel per SEVERITY value.
age_values = df_new['AGE']
severity_groups = df_new['SEVERITY']
age_values.hist(by=severity_groups, bins=20, figsize=(20, 7))

array([<matplotlib.axes._subplots.AxesSubplot object at 0x000002A32BC03EB8>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000002A32BBD2160>],
      dtype=object)

The figure above shows the AGE distribution of our dataset for SEVERITY 0 and 1. We can observe that younger people are more likely to have a SEVERITY of 0 ('no') than 1 ('yes').

# MARGIN histograms, one panel per SEVERITY value.
margin_values = df_new['MARGIN']
severity_groups = df_new['SEVERITY']
margin_values.hist(by=severity_groups, bins=20, figsize=(20, 7))

array([<matplotlib.axes._subplots.AxesSubplot object at 0x000002A32C0599B0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000002A32C0958D0>],
      dtype=object)

# SHAPE histograms, one panel per SEVERITY value.
shape_values = df_new['SHAPE']
severity_groups = df_new['SEVERITY']
shape_values.hist(by=severity_groups, bins=20, figsize=(20, 7))

array([<matplotlib.axes._subplots.AxesSubplot object at 0x000002A32C170CF8>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000002A32C1B2080>],
      dtype=object)

Now we have to standardize our dataset. StandardScaler standardizes a feature by subtracting the mean and then scaling to unit variance. Unit variance means dividing all the values by the standard deviation. StandardScaler results in a distribution with a standard deviation equal to 1.

# Standardize only the predictor columns. SEVERITY is the 0/1 class label, so
# it is carried through unscaled instead of being turned into z-scores.
standardscaler = preprocessing.StandardScaler()
feature_mask = df_new.columns != 'SEVERITY'
# Float copy with the same column layout as df_new, so it can still be wrapped
# back into a DataFrame using df_new's index and columns.
df_scaler = df_new.values.astype(float)
df_scaler[:, feature_mask] = standardscaler.fit_transform(df_scaler[:, feature_mask])

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:645: DataConversionWarning: Data with input dtype uint8, float64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:464: DataConversionWarning: Data with input dtype uint8, float64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)

# Wrap the scaled array in a DataFrame carrying df_new's row and column labels.
scaled_features_df = pd.DataFrame(
    data=df_scaler,
    columns=df_new.columns,
    index=df_new.index,
)

Above is our scaled dataset, which has been transformed using StandardScaler.

Now we have to split our dataset into training and test sets: 75% for training and 25% for testing.

# Predictor columns go into x; the SEVERITY column is the target y.
attribute_columns = ['AGE', 'SHAPE', 'MARGIN', 'DENSITY']
x = scaled_features_df.loc[:, attribute_columns]
y = scaled_features_df.loc[:, 'SEVERITY']

Above, 'y' defines the target variable and 'x' holds the feature/attribute columns, splitting our dataset into features and target.

# Hold out 25% of the rows for testing; random_state fixes the shuffle.
split_parts = train_test_split(x, y, test_size=0.25, random_state=1)
x_train, x_test, y_train, y_test = split_parts

# Shapes of the training features and training labels.
train_shapes = (x_train.shape, y_train.shape)
print(*train_shapes)

(623, 4) (623,)

# Shapes of the test features and test labels.
test_shapes = (x_test.shape, y_test.shape)
print(*test_shapes)

(208, 4) (208,)

Below we create the Decision Tree model.

# Convert the target into integer class labels, since the classifier expects
# discrete classes rather than floats.
lab_enc = preprocessing.LabelEncoder()
y_train_encoded = lab_enc.fit_transform(y_train)
# Reuse the mapping learned from the training labels; refitting on the test
# labels could assign different codes if the two sets of labels differed.
y_test_encoded = lab_enc.transform(y_test)

# Decision Tree classifier, seeded like the train/test split so the reported
# accuracy can be reproduced.
clf = DecisionTreeClassifier(random_state=1)

# Train Decision Tree
clf = clf.fit(x_train, y_train_encoded)

# Predict response
y_pred = clf.predict(x_test)

# Evaluate the model
print("Accuracy:", metrics.accuracy_score(y_test_encoded, y_pred))

Accuracy: 0.7067307692307693

The Decision Tree classifier achieved 70.67% accuracy, which is a good accuracy.

The following will use the RandomForest classifier for prediction.

# Random forest of 100 trees, seeded like the train/test split so the reported
# accuracy can be reproduced.
clf = RandomForestClassifier(n_estimators=100, random_state=1)

# Train model
clf.fit(x_train, y_train_encoded)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Predict the labels of the held-out rows.
y_pred = clf.predict(X=x_test)

# Fraction of test labels predicted correctly.
forest_accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", forest_accuracy)

Accuracy: 0.7548076923076923

RandomForest classifier has produced 75.48% accuracy

The following will utilize the KNN classifier

# K-nearest-neighbours classifier voting over the 5 closest training points.
neighbor_count = 5
knn = KNeighborsClassifier(n_neighbors=neighbor_count)

# Fit on the training split.
knn.fit(X=x_train, y=y_train_encoded)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

# Predict the labels of the held-out rows with the fitted KNN model.
y_pred = knn.predict(X=x_test)

# Fraction of test labels predicted correctly.
knn_accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", knn_accuracy)

Accuracy: 0.7932692307692307

KNN has produced 79.33% accuracy.

The following will utilize Naive Bayes classifier.

# Gaussian Naive Bayes classifier with default settings.
gnb = GaussianNB()

# Fit on the training split.
gnb.fit(X=x_train, y=y_train_encoded)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-22e4d169b994> in <module>
      1 #Model Trained
----> 2 gnb.fit(x_train,y_train_encoded)

NameError: name 'x_train' is not defined

# Predict the labels of the held-out rows with the fitted Naive Bayes model.
y_pred = gnb.predict(X=x_test)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-8bf8afb15a4c> in <module>
      1 #predict
----> 2 y_pred=gnb.predict(x_test)

NameError: name 'x_test' is not defined

# Fraction of test labels predicted correctly.
gnb_accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", gnb_accuracy)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-5e64f4b65919> in <module>
      1 #Model accuracy
----> 2 print("Accuracy:",metrics.accuracy_score(y_test_encoded,y_pred))

NameError: name 'y_test_encoded' is not defined

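The Gaussian Naive Bayes classifier has produced 79.807% accuracy (~80%). Note that the cell outputs shown above are NameError tracebacks rather than this result; they appear to come from running these cells before the earlier cells had been executed in that session.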

	BI-RADS	AGE	SHAPE	MARGIN	DENSITY	SEVERITY
0	5.0	67.0	3.0	5.0	3.0	yes
1	4.0	43.0	1.0	1.0	NaN	yes
2	5.0	58.0	4.0	5.0	3.0	yes
3	4.0	28.0	1.0	1.0	3.0	no
4	5.0	74.0	1.0	5.0	NaN	yes

	AGE	SHAPE	MARGIN	DENSITY	SEVERITY
0	67.0	3.0	5.0	3.0	yes
1	43.0	1.0	1.0	NaN	yes
2	58.0	4.0	5.0	3.0	yes
3	28.0	1.0	1.0	3.0	no
4	74.0	1.0	5.0	NaN	yes

	AGE	SHAPE	MARGIN	DENSITY	yes
0	67.0	3.0	5.0	3.0	1
2	58.0	4.0	5.0	3.0	1
3	28.0	1.0	1.0	3.0	0
8	57.0	1.0	5.0	3.0	1
10	76.0	1.0	4.0	3.0	1

	AGE	SHAPE	MARGIN	DENSITY	SEVERITY
0	67.0	3.0	5.0	3.0	1
2	58.0	4.0	5.0	3.0	1
3	28.0	1.0	1.0	3.0	0
8	57.0	1.0	5.0	3.0	1
10	76.0	1.0	4.0	3.0	1

	AGE	SHAPE	MARGIN	DENSITY	SEVERITY
count	831.000000	831.000000	831.000000	831.000000	831.000000
mean	55.777377	2.783394	2.814681	2.915764	0.484958
std	14.663528	1.242331	1.566771	0.350737	0.500075
min	18.000000	1.000000	1.000000	1.000000	0.000000
25%	46.000000	2.000000	1.000000	3.000000	0.000000
50%	57.000000	3.000000	3.000000	3.000000	0.000000
75%	66.000000	4.000000	4.000000	3.000000	1.000000
max	96.000000	4.000000	5.000000	4.000000	1.000000

	5	67	3	5.1	3.1	yes
955	4	47	2	1	3	no
956	4	56	4	5	3	yes
957	4	64	4	5	3	no
958	5	66	4	5	3	yes
959	4	62	3	3	3	no