import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import metrics, svm
from sklearn import utils
import random
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
# Raw load with pandas defaults (the first row of the file is used as header).
data = pd.read_csv("mammographic.csv")
data.head()
data.tail()

# Reload with explicit column names and no header row.
colnames = ['BI-RADS', 'AGE', 'SHAPE', 'MARGIN', 'DENSITY', 'SEVERITY']
data1 = pd.read_csv("mammographic.csv", header=None, names=colnames)
data1.head()
data1.tail()

# Final load: additionally parse '?' entries as missing values (NaN).
na = ['?']
df = pd.read_csv('mammographic.csv', header=None, names=colnames, na_values=na)
df.head()

# Number of missing values in each column.
print(df.isna().sum())
The output above shows the number of missing values per column. As we can observe, the 'DENSITY' variable has the most missing values, and the only variable with no missing values is 'SEVERITY'.
# Per-column count of missing values in the loaded frame.
missing_values = df.isna().sum()
missing_values

# Dropping incomplete rows leaves no missing values; the source frame is untouched.
df_clean = df.dropna()
df_clean.isna().sum()
df.isna().sum()

# Remove the BI-RADS column, then drop incomplete rows again; this rebinds
# df_clean to the BI-RADS-free version.
df2 = df.drop(columns=['BI-RADS'])
df2.head()
df_clean = df2.dropna()
df_clean.head()
We now have to transform the 'SEVERITY' column into numerical data: 1 for 'yes' and 0 for 'no'.
df_clean.head()

# One-hot encode the 'yes'/'no' SEVERITY labels. dtype=int forces 0/1 integers;
# without it, recent pandas versions return boolean columns, which describe()
# skips and which show up as True/False in grouped plots.
df3 = pd.get_dummies(df_clean['SEVERITY'], dtype=int)
df3.head()

# Keep only the 'yes' indicator: 1 means 'yes', 0 means 'no'.
df3.pop('no')
df3.head()

# Append the indicator to the cleaned features (aligned on the row index).
df_new = pd.concat([df_clean, df3], sort=False, axis=1)
df_new.head()

# Drop the original text column and let the indicator take over its name.
df_new = df_new.drop(['SEVERITY'], axis=1)
df_new.head()
df_new = df_new.rename(columns={'yes': 'SEVERITY'})
df_new.head()
Our new dataset has its categorical variable transformed into a numerical one. I have also dropped all the null values from the dataset. Now let's see whether we need to normalize our dataset.
# Number of non-null entries in each column of the encoded dataset.
df_new.count()
# Summary statistics for the dataset's columns.
df_new.describe()
Above are the basic statistics of our dataset. They give us a quick overview of the shape of the data.
# Histogram of patient ages using seaborn's default theme.
sns.set()
plt.hist(df_new['AGE'])
plt.xlabel('Age group')
plt.ylabel('Distribution')
plt.show()
Above is the age distribution of our dataset; the largest age group is the 60 to 70 group.
# Histogram of the SHAPE attribute using seaborn's default theme.
sns.set()
plt.hist(df_new['SHAPE'])
plt.xlabel('SHAPE Group')
plt.ylabel('Distribution')
plt.show()
'SHAPE' is a variable ordered from round to irregular: SHAPE {1: ROUND, 2: OVAL, 3: LOBULAR, 4: IRREGULAR}. As we can observe, the irregular 'SHAPE' is the most frequent category in our dataset.
# Histogram of the MARGIN attribute using seaborn's default theme.
sns.set()
plt.hist(df_new['MARGIN'])
plt.xlabel('MARGIN Group')
plt.ylabel('Distribution')
plt.show()
The above distribution visualizes the attribute 'MARGIN': {1: circumscribed, 2: microlobulated, 3: obscured, 4: ill-defined, 5: spiculated}. From the distribution we also observe that the circumscribed category is the most frequent.
# Histogram of the DENSITY attribute using seaborn's default theme.
sns.set()
plt.hist(df_new['DENSITY'])
plt.xlabel('DENSITY Group')
plt.ylabel('Distribution')
plt.show()
The above plot describes the distribution of 'DENSITY' in our dataset. DENSITY: {1: high, 2: iso, 3: low, 4: fat-containing}. From the distribution, the largest category observed in our dataset is 3.0, which is the low mass density.
# AGE histograms, one panel per SEVERITY value.
df_new['AGE'].hist(
    by=df_new['SEVERITY'],
    bins=20,
    figsize=(20, 7),
)
The above figure shows the AGE distribution of our dataset for SEVERITY 1 and 0. We can observe that younger people are more likely to have a SEVERITY value of 0 ('no') than 1 ('yes').
# MARGIN and SHAPE histograms, each split into one panel per SEVERITY value.
for feature in ('MARGIN', 'SHAPE'):
    df_new[feature].hist(by=df_new['SEVERITY'], bins=20, figsize=(20, 7))
Now we have to standardize our dataset. StandardScaler standardizes a feature by subtracting the mean and then scaling to unit variance, i.e. dividing all the values by the standard deviation. The result is a distribution with a mean of 0 and a standard deviation of 1.
# Standardize only the predictor columns. SEVERITY is the class label, so it
# is carried over unchanged instead of being turned into standardized floats.
feature_columns = [col for col in df_new.columns if col != 'SEVERITY']
standardscaler = preprocessing.StandardScaler()
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling; consider fitting on the
# training rows only.
df_scaler = standardscaler.fit_transform(df_new[feature_columns])
scaled_features_df = pd.DataFrame(df_scaler, index=df_new.index, columns=feature_columns)
# Re-attach the untouched target so the frame keeps the same columns as df_new.
scaled_features_df['SEVERITY'] = df_new['SEVERITY']
Above is our scaled dataset, which has been transformed using StandardScaler.
Now we have to split our dataset into training and test sets: 75% for training and 25% for testing.
# Predictor columns used as model inputs.
attribute_columns = ['AGE', 'SHAPE', 'MARGIN', 'DENSITY']
x = scaled_features_df[attribute_columns]
# Target column to be predicted.
y = scaled_features_df['SEVERITY']
Above, 'y' defines the target variable and 'x' holds the feature/attribute columns, splitting our dataset into attributes and target.
# Hold out 25% of the rows for testing; random_state=1 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)
# Shapes of the training features and labels.
print(x_train.shape, y_train.shape)
# Shapes of the test features and labels.
print(x_test.shape, y_test.shape)
Below we create the Decision Tree model.
# Encode the target as integer class labels. The encoder is fitted on the
# training labels only and that same mapping is reused for the test labels;
# re-fitting on the test labels could assign them a different encoding.
lab_enc = preprocessing.LabelEncoder()
y_train_encoded = lab_enc.fit_transform(y_train)
y_test_encoded = lab_enc.transform(y_test)

# Decision tree classifier; the seed makes the reported accuracy reproducible.
clf = DecisionTreeClassifier(random_state=1)
# Train the decision tree on the training split.
clf = clf.fit(x_train, y_train_encoded)
# Predict labels for the held-out test split.
y_pred = clf.predict(x_test)
# Evaluate: fraction of test samples classified correctly.
print("Accuracy:", metrics.accuracy_score(y_test_encoded, y_pred))
The Decision Tree classifier achieved 70.67% accuracy, which is a good accuracy.
The following will utilize the RandomForest classifier for prediction.
# Random forest of 100 trees; the seed makes the reported accuracy
# reproducible instead of changing on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=1)
# Train the model on the training split.
clf.fit(x_train, y_train_encoded)
# Predict labels for the held-out test split.
y_pred = clf.predict(x_test)
# Model accuracy: fraction of test samples classified correctly.
print("Accuracy:", metrics.accuracy_score(y_test_encoded, y_pred))
The RandomForest classifier produced 75.48% accuracy.
The following will utilize the KNN classifier
# k-nearest-neighbours classifier using the 5 nearest training samples.
knn = KNeighborsClassifier(n_neighbors=5)
# Fit on the training split.
knn.fit(x_train, y_train_encoded)
# Predict labels for the held-out test split.
y_pred = knn.predict(x_test)
# Fraction of test samples classified correctly.
print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
The KNN classifier produced 79.32% accuracy.
The following will utilize Naive Bayes classifier.
# Gaussian Naive Bayes classifier.
gnb = GaussianNB()
# Fit on the training split.
gnb.fit(x_train, y_train_encoded)
# Predict labels for the held-out test split.
y_pred = gnb.predict(x_test)
# Fraction of test samples classified correctly.
print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
The Gaussian Naive Bayes classifier produced 79.807% accuracy (~80%).