In [1]:
import pandas as pd
import numpy as np
Load the train and test datasets to create two DataFrames¶
In [2]:
# Read the Kaggle Titanic train/test splits from the local data directory.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
In [3]:
# Use print as a function: the old `print "..."` statement is a syntax
# error on Python 3, while single-argument print(...) works on both 2 and 3.
print("train len: {}".format(len(train)))
print("test len: {}".format(len(test)))
# data example — bare last expression renders the rich HTML table
train.head()
Out[3]:
In [4]:
# Count the missing values per column of interest.
age_null = train["Age"].isnull()
# print() calls are Python 3 compatible; the original messages were a
# copy-paste slip ("missed age embarked value" etc.) — each line now
# reports the column it actually counts.
print("missed age count: {}".format(len(train[age_null])))
print("missed embarked count: {}".format(len(train[train["Embarked"].isnull()])))
print("missed fare count: {}".format(len(train[train["Fare"].isnull()])))
print("missed cabin count: {}".format(len(train[train["Cabin"].isnull()])))
In [5]:
print "For {} passangers fare is 0.".format(len(train[train["Fare"] == 0]))
Which passengers are missing the Embarked field?¶
In [6]:
train[train["Embarked"].isnull()]
Out[6]:
Boarding info is missing for 2 first-class passengers. Let's see how many passengers from the different ports held a first-class ticket¶
In [7]:
# Exclude zero fares, keep first class only, and summarise Fare per port.
zero_fare = train["Fare"] == 0
first_class = train["Pclass"] == 1
# Use string aggregator names: passing raw numpy callables (np.size, np.min,
# ...) to aggfunc is deprecated in modern pandas and mislabels the min/max
# columns as 'amin'/'amax'.
train[(~zero_fare) & first_class].pivot_table(values='Fare', index='Embarked', aggfunc=["size", "mean", "min", "max"])
Out[7]:
In [8]:
train[(~zero_fare) & first_class & (train["Fare"] == 5.0)]
Out[8]:
It seems most likely that passengers 62 and 830 boarded in Southampton¶
In [9]:
# Passengers 62 and 830 (index labels 61 and 829) are the only rows with a
# missing Embarked value (shown two cells above).  Impute via the null mask
# rather than hardcoded index labels: this is idempotent and stays correct
# even if the row labels ever shift.
train.loc[train["Embarked"].isnull(), "Embarked"] = 'S'
Let's look at the passengers that have a fare of 0¶
In [10]:
train[zero_fare]
Out[10]:
All of them boarded in Southampton; let's see what we have for Southampton¶
In [11]:
# Per-class fare statistics for Southampton boarders with a non-zero fare.
from_southampton = train["Embarked"] == "S"
paid_from_s = ~zero_fare & from_southampton
train[paid_from_s].groupby("Pclass")["Fare"].describe()
Out[11]:
While looking at how to propagate the missing fare, I saw that some 3rd-class passengers paid much more than others.¶
In [12]:
# Third-class passengers whose fare is suspiciously high (> 69).
third_class = train["Pclass"] == 3
suspiciously_expensive = train["Fare"] > 69
train[suspiciously_expensive & third_class]
Out[12]:
This is a big family; it seems this is the fare for the whole family: 69.55 / 10 = 6.955. Mean for 3rd-class passengers from Southampton¶
In [13]:
# Fare statistics for 3rd-class Southampton boarders, excluding zero fares
# and the Sage family's shared ticket (CA. 2343), which skews the mean.
third_from_s = from_southampton & third_class
not_sage_ticket = train["Ticket"] != "CA. 2343"
train[~zero_fare & third_from_s & not_sage_ticket].groupby("Pclass")["Fare"].describe()
Out[13]:
Let's fill in the missing fare data¶
But first, I'll create a new dataset by combining the train and test datasets in order to get a more precise mean of "fare"
In [14]:
all_data = train.copy().append(test.copy())
As was discovered with the Sage family, some passengers share the same ticket number, with a combined ticket price for the whole family/group.¶
Let's create a new column with the count of passengers that share the same ticket number
In [15]:
all_data["group"] = all_data.groupby("Ticket")["PassengerId"].transform("count")
Now I can calculate ticket price per passenger¶
In [16]:
# The recorded Fare is for the whole ticket; divide by the number of
# passengers sharing it to get a per-passenger price.
all_data["ticket_price"] = all_data["Fare"].div(all_data["group"])
# Summarise the positive per-passenger prices by class.
positive_price = all_data["ticket_price"] > 0
all_data[positive_price].groupby("Pclass").ticket_price.describe()
Out[16]:
I noticed an outlier in the data for 1st-class passengers: max ticket_price of 128.082300 with a mean of 34. Interesting¶
In [17]:
all_data[all_data["ticket_price"] > 128]
Out[17]:
According to https://www.encyclopedia-titanica.org/titanic-survivor/thomas-cardeza.html, that was mother and son with 2 their servants. They occupied most expensive cabins
In [18]:
all_data[(all_data["SibSp"] + all_data["Parch"]) + 1 > all_data["group"]].head()
Out[18]:
It seems that not all family members have the same/shared ticket number.¶
Let's update the "group" column with the maximum of the current value or Parch + SibSp + 1, whichever is bigger
In [19]:
# "group" already holds the per-ticket passenger count computed earlier and
# has not been modified since, so the original duplicate recomputation of
# the transform("count") is dropped here.
# Declared family size: siblings/spouses + parents/children + the passenger.
all_data["family"] = all_data["SibSp"] + all_data["Parch"] + 1
# Widen "group" to the family size whenever the family is larger than the
# set of passengers sharing the ticket.
all_data["group"] = all_data[["family", "group"]].max(axis=1)
Now I can calculate ticket_price for the passengers that originally had a missing "Fare" value¶
In [20]:
# A fare is "missing" when it is recorded as 0 or is NaN.
zero_fare = all_data["Fare"].isnull() | (all_data["Fare"] == 0)
first_class = all_data["Pclass"] == 1
second_class = all_data["Pclass"] == 2
third_class = all_data["Pclass"] == 3
# All passengers with a missing fare value boarded in Southampton, so the
# imputation mean is taken over Southampton boarders of the same class.
from_southampton = all_data["Embarked"] == "S"
first_from_s = from_southampton & first_class
second_from_s = from_southampton & second_class
third_from_s = from_southampton & third_class
# For each class: fill the missing per-passenger prices with the mean
# per-passenger price of same-class Southampton boarders who did pay.
for in_class, in_class_from_s in ((first_class, first_from_s),
                                  (second_class, second_from_s),
                                  (third_class, third_from_s)):
    fill_value = all_data[~zero_fare & in_class_from_s].ticket_price.mean()
    all_data.loc[zero_fare & in_class, "ticket_price"] = fill_value
Time to investigate age data¶
In [21]:
# Age statistics over the rows where Age is present.
age_null = all_data["Age"].isnull()
all_data.loc[~age_null, "Age"].describe()
Out[21]:
In [22]:
all_data[age_null].groupby("Pclass")["PassengerId"].count()
Out[22]:
In [23]:
all_data[~age_null & third_class].groupby("Sex").Age.describe()
Out[23]:
In [24]:
all_data[~age_null & first_class].groupby("Sex").Age.describe()
Out[24]:
It seems that the median age of first-class passengers is higher than 3rd class; also, women are younger than men.¶
Let's use this information to fill in our missing data
In [25]:
# Impute missing ages with the mean age of the matching (class, sex) group.
# The six copy-pasted assignments are replaced by a loop over the class and
# sex masks — same assignments, same order, no repetition.
women = all_data["Sex"] == "female"
men = all_data["Sex"] == "male"
for class_mask in (first_class, second_class, third_class):
    for sex_mask in (women, men):
        group_mean_age = all_data[~age_null & class_mask & sex_mask].Age.mean()
        all_data.loc[age_null & class_mask & sex_mask, "Age"] = group_mean_age
And finally, propagate the calculated age and fare back to our original train and test datasets¶
In [26]:
# Split the imputed all_data back into its train and test parts by
# PassengerId membership.
# NOTE(review): the assignments below rely on pandas index alignment —
# all_data was built by appending test to train, so its train part keeps
# train's original row index and its test part keeps test's; alignment
# therefore maps each imputed value back to the right row.  Confirm this
# still holds if the construction of all_data ever changes.
train_filter = all_data[all_data.PassengerId.isin(train.PassengerId)]
test_filter = all_data[all_data.PassengerId.isin(test.PassengerId)]
# Fill only the originally-missing ages; copy the new columns wholesale.
train.loc[train["Age"].isnull(), "Age"] = train_filter.Age
test.loc[test["Age"].isnull(), "Age"] = test_filter.Age
train["ticket_price"] = train_filter.ticket_price
test["ticket_price"] = test_filter.ticket_price
train["group"] = train_filter.group
test["group"] = test_filter.group
Prepare data for ml, converting categorical classes to integer representation¶
In [27]:
# One-hot encode the categorical predictors for both splits.
categorical_cols = ["Pclass", "Sex", "Embarked"]
df_categorical_train = pd.get_dummies(train[categorical_cols])
df_categorical_test = pd.get_dummies(test[categorical_cols])
# Target labels as a plain numpy array for the sklearn estimators.
target_train = train["Survived"].values
In [28]:
#add visual utils
from inspect import getsourcefile
import os.path as path, sys
current_dir = path.dirname(path.dirname(path.abspath(getsourcefile(lambda:0))))
sys.path.insert(0, current_dir[:current_dir.rfind(path.sep)])
import visuals as vs
import metrics as ms
In [29]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
In [30]:
# Assemble the model input: numeric features plus the one-hot columns,
# converted to plain numpy arrays.
features = ["Age", "group", "ticket_price"]
numeric_train = train[features]
numeric_test = test[features]
selected_features_train = pd.concat([numeric_train, df_categorical_train], axis=1).values
selected_features_test = pd.concat([numeric_test, df_categorical_test], axis=1).values
In [31]:
# Initialize the three models
# Initialize the three ensemble models.
# Pin random_state so the stochastic learners give reproducible results
# across kernel restarts (no seed was set anywhere in the notebook).
clf_A = AdaBoostClassifier(random_state=42)
clf_B = GradientBoostingClassifier(random_state=42)
clf_C = RandomForestClassifier(random_state=42)
# Calculate the number of samples for 1%, 10%, and 100% of the training data
samples_1 = int(len(selected_features_train) * 0.01)
samples_10 = int(len(selected_features_train) * 0.1)
samples_100 = len(selected_features_train)
# Collect results on the learners: results[classifier name][sample-size idx]
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        # ms.train_predict is a project-local helper; presumably it fits clf
        # on `samples` rows and returns its metrics — confirm in metrics.py.
        results[clf_name][i] = ms.train_predict(clf, samples, selected_features_train, target_train, selected_features_test)
# Run metrics visualization for the three supervised learning models chosen
vs.evaluate(results)
In [32]:
# NOTE(review): clf_A's fitted state is whatever the last ms.train_predict
# call left it with (the 100% sample) — confirm that is the model intended
# for the submission.
predictions = clf_A.predict(selected_features_test)
# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
passenger_ids = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(predictions, index=passenger_ids, columns=["Survived"])
# Write the solution next to the notebook instead of the hardcoded absolute
# local path (/home/denys/...), so the notebook runs on any machine.
my_solution.to_csv("AdaBoostClassifier.csv", index_label=["PassengerId"])