[Kaggle] Titanic 데이터 분석

[Kaggle] Titanic 데이터 분석

import pandas as pd
# Load the Kaggle Titanic training and test sets from the local input/ folder.
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
print("=============train==============")
print(train.head())
print("=============test==============")
print(test.head())

print(train.shape)  # author's note: 891 rows, 12 columns
# author's note: 418 rows. NOTE(review): the original note also said 12
# columns, but the test file is expected to lack 'Survived' (11) - confirm.
print(test.shape)

# Row counts, dtypes and non-null counts per column.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line.
train.info()
test.info()

# Number of missing values per column.
print(train.isnull().sum())
print(test.isnull().sum())

import matplotlib.pyplot as plt
import seaborn as sns
# Activate seaborn's default plot theme for the figures drawn below.
sns.set()

def bar_chart(feature):
    """Plot a stacked bar chart of value counts of `feature`, split by outcome.

    Reads the module-level `train` DataFrame. One bar is drawn for rows with
    Survived == 1 and one for rows with Survived == 0; each bar is stacked by
    the distinct values found in the `feature` column.

    Args:
        feature: Name of a column in `train`.
    """
    outcome = train['Survived']
    # Order matters: survivors first, so the row labels below line up.
    counts_by_outcome = [
        train.loc[outcome == flag, feature].value_counts()
        for flag in (1, 0)
    ]
    counts = pd.DataFrame(counts_by_outcome)
    counts.index = ['Survived', 'Dead']
    counts.plot(kind='bar', stacked=True, figsize=(10, 7))
#bar_chart('Sex')
#plt.show()
#bar_chart('Pclass')
#plt.show()
#bar_chart('SibSp') #가족이 있을땐 더 살았다.
#plt.show()
#bar_chart('Parch')
#plt.show()
#bar_chart('Embarked')

#feature == 컬럼
#feature vectors 생성필요.

# Name: derive a numeric 'Title' feature from the honorific inside each name.
# Keep both frames in one list so every transformation below is applied to
# the training and the test data alike.
train_test_data = [train, test]
for dataset in train_test_data:
    # Capture the run of letters that directly precedes a '.' (e.g. "Mr.").
    # A raw string is required: '\.' in a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

print(train['Title'].value_counts())
print(test['Title'].value_counts())

# Mr / Miss / Mrs get their own codes; every other listed title shares code 3.
# A title that is not a key of this dict becomes NaN after .map().
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 3, "Rev": 3, "Col": 3,
                 "Major": 3, "Mlle": 3, "Countess":3, "Ms":3, "Lady":3, "Jonkheer":3, "Don":3, "Dona":3, "Mme":3, "Capt":3,"Sir":3}
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)

print(train.head())
bar_chart('Title')
#plt.show()

# The raw name is no longer needed once the title has been extracted.
# Dropping in place keeps the objects referenced by train_test_data valid.
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

# Sex: encode as a number (male -> 0, female -> 1) in both frames.
sex_mapping = dict(male=0, female=1)
for frame in train_test_data:
    encoded_sex = frame['Sex'].map(sex_mapping)
    frame['Sex'] = encoded_sex
bar_chart('Sex')
#plt.show()

# Age
# Missing ages must be filled before the column can be binned.
# Fill each missing Age with the median Age of the rows sharing the same Title.
# The result is assigned back to the column: calling fillna(inplace=True) on
# train["Age"] is chained assignment, which recent pandas versions warn about
# and which does not update the DataFrame once Copy-on-Write is enabled.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))


# Density of Age for each Survived value, drawn on one shared axis.
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, train['Age'].max()))
facet.add_legend()
#plt.xlim(0,20) # restrict the view to ages 0-20
#plt.show()

# Binning: replace the raw age with an ordinal category code.
#   child:   0  (Age <= 16)
#   young:   1  (16 < Age <= 26)
#   adult:   2  (26 < Age <= 36)
#   mid-age: 3  (36 < Age <= 62)
#   senior:  4  (Age > 62)
# The rules run from youngest to oldest; codes written by an earlier rule are
# all <= 4 and therefore never match a later "> 16" style condition.
# Each right-hand side is a plain scalar. The original lines ended in a
# trailing comma, which made the assigned value a one-element tuple.
for dataset in train_test_data:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 26), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 26) & (dataset['Age'] <= 36), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 36) & (dataset['Age'] <= 62), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 62), 'Age'] = 4

print(train.head())
bar_chart('Age')
#plt.show()

# Embarked
# Count the embarkation ports per passenger class and plot them stacked.
Pclass1 = train[train['Pclass']==1]['Embarked'].value_counts()
Pclass2 = train[train['Pclass']==2]['Embarked'].value_counts()
Pclass3 = train[train['Pclass']==3]['Embarked'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
# Row labels shown on the chart (the second one was misspelled '2nd clsdd').
df.index = ['1st class', '2nd class', '3rd class']
df.plot(kind='bar', stacked=True, figsize=(10,10))
#plt.show()

# Fill missing ports with 'S' in both frames.
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
print(train.head())

# Encode the port letter as a number.
embarked_mapping = {"S":0, "C":1, "Q":2}
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)

# Fare
# Fill each missing Fare with the median Fare of the same Pclass.
# The result is assigned back to the column: calling fillna(inplace=True) on
# train["Fare"] is chained assignment, which recent pandas versions warn about
# and which does not update the DataFrame once Copy-on-Write is enabled.
train["Fare"] = train["Fare"].fillna(train.groupby("Pclass")["Fare"].transform("median"))
test["Fare"] = test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"))

# Density of Fare for each Survived value, drawn on one shared axis.
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Fare', shade =True)
facet.set(xlim=(0, train['Fare'].max()))
facet.add_legend()
#plt.show()

# Bin the fare into four ordinal codes: <=17 -> 0, (17, 30] -> 1,
# (30, 100] -> 2, >100 -> 3. The rules run from cheapest to most expensive;
# codes written by an earlier rule are all <= 3 and never match a later one.
# Each right-hand side is a plain scalar. The original lines ended in a
# trailing comma, which made the assigned value a one-element tuple.
for dataset in train_test_data:
    dataset.loc[dataset['Fare'] <= 17, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2
    dataset.loc[(dataset['Fare'] > 100), 'Fare'] = 3

print(train.head())


# Cabin
print(train.Cabin.value_counts())


# Keep only the first character of the cabin string.
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].str[:1]
# Count the cabin letters per passenger class and plot them stacked.
Pclass1 = train[train['Pclass']==1]['Cabin'].value_counts()
Pclass2 = train[train['Pclass']==2]['Cabin'].value_counts()
Pclass3 = train[train['Pclass']==3]['Cabin'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ['1st class', '2nd class', '3rd class']
df.plot(kind='bar', stacked=True, figsize=(10,7))
#plt.show()


# Feature scaling: map the cabin letters onto evenly spaced values (step 0.4).
cabin_mapping = {"A":0, "B":0.4, "C": 0.8, "D":1.2, "E":1.6, "F":2, "G":2.4, "T":2.8}
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)

# Fill each missing Cabin value with the median Cabin value of the same Pclass.
# The result is assigned back to the column: calling fillna(inplace=True) on
# train["Cabin"] is chained assignment, which recent pandas versions warn about
# and which does not update the DataFrame once Copy-on-Write is enabled.
train["Cabin"] = train["Cabin"].fillna(train.groupby("Pclass")["Cabin"].transform("median"))
test["Cabin"] = test["Cabin"].fillna(test.groupby("Pclass")["Cabin"].transform("median"))


# FamilySize: siblings/spouses + parents/children + the passenger themselves.
for frame in (train, test):
    relatives = frame["SibSp"] + frame["Parch"]
    frame["FamilySize"] = relatives + 1

# Density of FamilySize for each Survived value, drawn on one shared axis.
largest_family = train['FamilySize'].max()
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'FamilySize', shade=True)
facet.set(xlim=(0, largest_family))
facet.add_legend()
plt.xlim(0)
#plt.show()

# Map family sizes 1..11 onto evenly spaced values (step 0.4).
family_mapping = {
    1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2,
    7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4,
}
for frame in train_test_data:
    scaled_size = frame['FamilySize'].map(family_mapping)
    frame['FamilySize'] = scaled_size
print(train.head())


# Drop the columns that are not used as model features.
# From here on `train` and `test` are new DataFrames; the list built earlier
# in train_test_data still points at the previous objects.
features_drop = ['Ticket', 'SibSp', 'Parch']
test = test.drop(columns=features_drop)
# PassengerId is removed from train only; test keeps it.
train = train.drop(columns=features_drop).drop(columns=['PassengerId'])

# Separate the label column from the feature matrix.
target = train['Survived']
train_data = train.drop(columns='Survived')
train_data.shape, target.shape  # bare expression: evaluated but not displayed in a script

print(train_data.head(10))

#import Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import numpy as np

train.info()


# Cross-validation (K-fold)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 10 shuffled folds with a fixed seed; the same splitter is passed to every model.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
scoring = 'accuracy'

# Candidate models, evaluated in the same order as the original
# copy-pasted sections. The SVM section previously called
# round(np,mean(score)*100, 2), which raised NameError because `mean` is not
# defined; a single shared loop removes that class of typo.
classifiers = [
    KNeighborsClassifier(n_neighbors=13),     # kNN
    DecisionTreeClassifier(),                 # Decision Tree
    RandomForestClassifier(n_estimators=13),  # Random Forest
    GaussianNB(),                             # Naive Bayes
    SVC(),                                    # SVM
]
for clf in classifiers:
    score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
    print(score)                           # accuracy of each fold
    print(round(np.mean(score) * 100, 2))  # mean accuracy, in percent

# Testing: fit the SVM on the full training set and predict the test set.
clf = SVC()
clf.fit(train_data, target)
# PassengerId is an identifier, not a feature, so it is removed before
# predicting. The column name was misspelled "PasengerId", which made drop()
# raise KeyError.
test_data = test.drop("PassengerId", axis=1).copy()
prediction = clf.predict(test_data)

# One row per test passenger: its id and the predicted Survived value.
submission = pd.DataFrame({
    "PassengerId":test["PassengerId"],
    "Survived":prediction})

submission.to_csv('submission.csv', index=False)  # write the submission as CSV
# Read the file back and show its first rows as a quick check of what was
# written. The bare submission.head() expression displayed nothing when run
# as a script, so it is printed explicitly.
submission = pd.read_csv('submission.csv')
print(submission.head())

유튜버 허민석님의 딥러닝 강좌를 보며 작성한 코드입니다.

You may also like...

답글 남기기

이메일은 공개되지 않습니다. 필수 입력창은 * 로 표시되어 있습니다.