[Kaggle] Titanic 데이터 분석
# [Kaggle] Titanic analysis — part 1: load the data, explore it, and engineer
# the Title and Sex features in place on the shared `train`/`test` frames.
import pandas as pd

train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

print("=============train==============")
print(train.head())
print("=============test==============")
print(test.head())

print(train.shape)  # 891 rows, 12 columns
print(test.shape)   # 418 rows, 11 columns (no Survived label)
print(train.info())  # dtypes and non-null counts
print(test.info())
print(train.isnull().sum())  # count missing values per column
print(test.isnull().sum())

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()


def bar_chart(feature):
    """Plot a stacked bar chart of `feature` value counts split by survival.

    Reads the module-level `train` frame; call plt.show() afterwards to view.
    """
    survived = train[train['Survived'] == 1][feature].value_counts()
    dead = train[train['Survived'] == 0][feature].value_counts()
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']
    df.plot(kind='bar', stacked=True, figsize=(10, 7))


# Exploratory charts (uncomment to view):
# bar_chart('Sex'); plt.show()
# bar_chart('Pclass'); plt.show()
# bar_chart('SibSp'); plt.show()   # passengers with family aboard survived more
# bar_chart('Parch'); plt.show()
# bar_chart('Embarked')

# Feature engineering: extract the honorific ("Mr", "Miss", ...) that precedes
# a dot in the Name column into a new Title feature.
train_test_data = [train, test]  # process both frames with the same steps
for dataset in train_test_data:
    # raw string: '\.' in a plain literal is an invalid escape (SyntaxWarning
    # on Python 3.12+)
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

print(train['Title'].value_counts())
print(test['Title'].value_counts())

# Map common titles to 0-2; all rare titles are bucketed together as 3.
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 3, "Rev": 3,
                 "Col": 3, "Major": 3, "Mlle": 3, "Countess": 3, "Ms": 3,
                 "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona": 3, "Mme": 3,
                 "Capt": 3, "Sir": 3}
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)

print(train.head())
bar_chart('Title')
# plt.show()

# Name is no longer needed once Title has been extracted.
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

# Encode Sex as 0 (male) / 1 (female).
sex_mapping = {"male": 0, "female": 1}
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

bar_chart('Sex')
# plt.show()

# Age has missing values; they are filled next (median age per Title group).
train.head(100)
# Part 2: remaining feature engineering (Age, Embarked, Fare, Cabin,
# FamilySize), k-fold model comparison, and the submission file.

# Age: fill missing values with the median age of passengers sharing the same
# Title. Plain assignment instead of `train["Age"].fillna(..., inplace=True)`:
# that chained-assignment form stops working in pandas 3.0.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))

facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, train['Age'].max()))
facet.add_legend()
# plt.xlim(0, 20)  # zoom in on ages 0-20
# plt.show()

# Binning: bucket Age into ordinal categories so the classifiers see a small
# discrete feature. child: 0, young: 1, adult: 2, mid-age: 3, senior: 4
for dataset in train_test_data:
    # no trailing commas here: `= 0,` assigned a 1-tuple, which modern pandas
    # rejects when the mask selects more than one row
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 26), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 26) & (dataset['Age'] <= 36), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 36) & (dataset['Age'] <= 62), 'Age'] = 3
    dataset.loc[dataset['Age'] > 62, 'Age'] = 4

print(train.head())
bar_chart('Age')
# plt.show()

# Embarked: inspect its distribution per passenger class.
Pclass1 = train[train['Pclass'] == 1]['Embarked'].value_counts()
Pclass2 = train[train['Pclass'] == 2]['Embarked'].value_counts()
Pclass3 = train[train['Pclass'] == 3]['Embarked'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ['1st class', '2nd class', '3rd class']  # fixed '2nd clsdd' typo
df.plot(kind='bar', stacked=True, figsize=(10, 10))
# plt.show()

# 'S' is by far the most common port, so use it for the two missing values.
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

print(train.head())

embarked_mapping = {"S": 0, "C": 1, "Q": 2}
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)

# Fare: fill missing values with the median fare of the passenger's class.
train["Fare"] = train["Fare"].fillna(train.groupby("Pclass")["Fare"].transform("median"))
test["Fare"] = test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"))

facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Fare', shade=True)
facet.set(xlim=(0, train['Fare'].max()))
facet.add_legend()
# plt.show()

# Bin Fare into four ordinal buckets (trailing-comma tuples removed here too).
for dataset in train_test_data:
    dataset.loc[dataset['Fare'] <= 17, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 100, 'Fare'] = 3

print(train.head())

# Cabin: keep only the deck letter (first character).
print(train.Cabin.value_counts())
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].str[:1]

Pclass1 = train[train['Pclass'] == 1]['Cabin'].value_counts()
Pclass2 = train[train['Pclass'] == 2]['Cabin'].value_counts()
Pclass3 = train[train['Pclass'] == 3]['Cabin'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ['1st class', '2nd class', '3rd class']
df.plot(kind='bar', stacked=True, figsize=(10, 7))
# plt.show()

# Feature scaling: map deck letters to small evenly-spaced floats, then fill
# missing cabins with the per-class median.
cabin_mapping = {"A": 0, "B": 0.4, "C": 0.8, "D": 1.2, "E": 1.6, "F": 2,
                 "G": 2.4, "T": 2.8}
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)
train["Cabin"] = train["Cabin"].fillna(train.groupby("Pclass")["Cabin"].transform("median"))
test["Cabin"] = test["Cabin"].fillna(test.groupby("Pclass")["Cabin"].transform("median"))

# FamilySize = siblings/spouses + parents/children + the passenger themselves.
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1
test["FamilySize"] = test["SibSp"] + test["Parch"] + 1

facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'FamilySize', shade=True)
facet.set(xlim=(0, train['FamilySize'].max()))
facet.add_legend()
plt.xlim(0)
# plt.show()

family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4,
                  8: 2.8, 9: 3.2, 10: 3.6, 11: 4}
for dataset in train_test_data:
    dataset['FamilySize'] = dataset['FamilySize'].map(family_mapping)

print(train.head())

# Drop features that carry no signal for the model; SibSp/Parch are already
# folded into FamilySize.
features_drop = ['Ticket', 'SibSp', 'Parch']
train = train.drop(features_drop, axis=1)
test = test.drop(features_drop, axis=1)
train = train.drop(['PassengerId'], axis=1)

train_data = train.drop('Survived', axis=1)
target = train['Survived']
train_data.shape, target.shape
print(train_data.head(10))

# Classifier modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import numpy as np

train.info()

# Cross-validation (k-fold, 10 splits)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
scoring = 'accuracy'  # shared by every model below

# kNN
clf = KNeighborsClassifier(n_neighbors=13)
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print(round(np.mean(score) * 100, 2))  # average accuracy (%)

# Decision tree
clf = DecisionTreeClassifier()
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print(round(np.mean(score) * 100, 2))

# Random forest
clf = RandomForestClassifier(n_estimators=13)
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print(round(np.mean(score) * 100, 2))

# Naive Bayes
clf = GaussianNB()
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print(round(np.mean(score) * 100, 2))

# SVM
clf = SVC()
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print(round(np.mean(score) * 100, 2))  # fixed: was `np,mean(...)` (comma typo -> TypeError)

# Final model: train SVM on the full training set and write the submission.
clf = SVC()
clf.fit(train_data, target)
test_data = test.drop("PassengerId", axis=1).copy()  # fixed "PasengerId" typo (KeyError)
prediction = clf.predict(test_data)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": prediction,
})
submission.to_csv('submission.csv', index=False)

# Reload to sanity-check the written CSV.
submission = pd.read_csv('submission.csv')
submission.head()
유튜버 허민석님의 딥러닝 강좌를 보며 작성한 코드입니다.
최근 댓글