타이타닉 종합 실습¶
Inย [ย ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
1. 데이터 로드 (& 분리)¶
Inย [ย ]:
# Read the Titanic train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'C:/Users/LOVE/Downloads/vscode/ML/titanic/train.csv',
        'C:/Users/LOVE/Downloads/vscode/ML/titanic/test.csv',
    )
)
2. EDA (간단, 생략)¶
Inย [ย ]:
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
Inย [ย ]:
train_df.describe(include = 'all')
Out[ย ]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 891 | 891 | 714.000000 | 891.000000 | 891.000000 | 891 | 891.000000 | 204 | 889 |
unique | NaN | NaN | NaN | 891 | 2 | NaN | NaN | NaN | 681 | NaN | 147 | 3 |
top | NaN | NaN | NaN | Braund, Mr. Owen Harris | male | NaN | NaN | NaN | 347082 | NaN | B96 B98 | S |
freq | NaN | NaN | NaN | 1 | 577 | NaN | NaN | NaN | 7 | NaN | 4 | 644 |
mean | 446.000000 | 0.383838 | 2.308642 | NaN | NaN | 29.699118 | 0.523008 | 0.381594 | NaN | 32.204208 | NaN | NaN |
std | 257.353842 | 0.486592 | 0.836071 | NaN | NaN | 14.526497 | 1.102743 | 0.806057 | NaN | 49.693429 | NaN | NaN |
min | 1.000000 | 0.000000 | 1.000000 | NaN | NaN | 0.420000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | NaN |
25% | 223.500000 | 0.000000 | 2.000000 | NaN | NaN | 20.125000 | 0.000000 | 0.000000 | NaN | 7.910400 | NaN | NaN |
50% | 446.000000 | 0.000000 | 3.000000 | NaN | NaN | 28.000000 | 0.000000 | 0.000000 | NaN | 14.454200 | NaN | NaN |
75% | 668.500000 | 1.000000 | 3.000000 | NaN | NaN | 38.000000 | 1.000000 | 0.000000 | NaN | 31.000000 | NaN | NaN |
max | 891.000000 | 1.000000 | 3.000000 | NaN | NaN | 80.000000 | 8.000000 | 6.000000 | NaN | 512.329200 | NaN | NaN |
3. 전처리¶
Inย [ย ]:
# Basic processing: create the Family variable
train_df_2 = train_df.copy()
def get_family(df):
    """Add a ``Family`` column to *df* in place and return the same frame.

    ``Family`` is ``SibSp + Parch + 1`` for every row. The caller's frame
    is modified, so the return value may be ignored.
    """
    siblings_spouses = df['SibSp']
    parents_children = df['Parch']
    family_size = siblings_spouses + parents_children + 1
    df['Family'] = family_size
    return df
get_family(train_df_2).head(5)
Out[ย ]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Family | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 2 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 2 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 1 |
Inย [ย ]:
# Check the numeric variables for outliers
sns.pairplot(train_df_2[['Age', 'Fare', 'Family']])
Out[ย ]:
<seaborn.axisgrid.PairGrid at 0x23a6d238550>
Inย [ย ]:
# Remove the outliers in the Fare variable: keep only rows whose Fare is
# below 512, then show the resulting (rows, columns) shape.
fare_below_cutoff = train_df_2['Fare'] < 512
train_df_2 = train_df_2[fare_below_cutoff]
train_df_2.shape
Out[ย ]:
(888, 13)
Inย [ย ]:
train_df_2[['Fare']].describe()
Out[ย ]:
Fare | |
---|---|
count | 888.000000 |
mean | 30.582164 |
std | 41.176366 |
min | 0.000000 |
25% | 7.895800 |
50% | 14.454200 |
75% | 30.771850 |
max | 263.000000 |
Inย [ย ]:
# Handle missing values
def get_non_missing(df, ref_df=None):
    """Fill missing ``Age``, ``Fare`` and ``Embarked`` values of *df* in place.

    ``Age`` and ``Fare`` gaps are filled with the column means of the
    reference frame, so another data set (e.g. the test set) is imputed
    with the reference statistics rather than its own. ``Embarked`` gaps
    are filled with the constant ``'S'``.

    Args:
        df: Frame with ``Age``, ``Fare`` and ``Embarked`` columns. It is
            modified in place.
        ref_df: Frame whose ``Age`` and ``Fare`` means are used. When
            omitted, the module-level ``train_df_2`` is used, which is the
            behaviour of the original implementation.

    Returns:
        The same *df* object that was passed in.
    """
    if ref_df is None:
        # Backward-compatible default: statistics come from the training frame.
        ref_df = train_df_2
    age_mean = ref_df['Age'].mean()
    fare_mean = ref_df['Fare'].mean()
    df['Age'] = df['Age'].fillna(age_mean)
    # Not needed for the train data, but added to fill the missing value
    # that exists in the test data.
    df['Fare'] = df['Fare'].fillna(fare_mean)
    df['Embarked'] = df['Embarked'].fillna('S')
    return df
get_non_missing(train_df_2).info()
<class 'pandas.core.frame.DataFrame'> Index: 888 entries, 0 to 890 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 888 non-null int64 1 Survived 888 non-null int64 2 Pclass 888 non-null int64 3 Name 888 non-null object 4 Sex 888 non-null object 5 Age 888 non-null float64 6 SibSp 888 non-null int64 7 Parch 888 non-null int64 8 Ticket 888 non-null object 9 Fare 888 non-null float64 10 Cabin 202 non-null object 11 Embarked 888 non-null object 12 Family 888 non-null int64 dtypes: float64(2), int64(6), object(5) memory usage: 97.1+ KB
Inย [ย ]:
# Numeric features
def get_numeric_sc(df, ref_df=None):
    """Append scaled numeric feature columns to *df* in place.

    Adds ``Fare_stnd_sc`` (standard-scaled ``Fare``) and
    ``Age_mnmx_sc`` / ``Family_mnmx_sc`` (min-max-scaled ``Age`` and
    ``Family``). Both scalers are fitted on the reference frame and then
    applied to *df*, so other data sets are scaled with the reference
    frame's statistics.

    Args:
        df: Frame with ``Fare``, ``Age`` and ``Family`` columns. It is
            modified in place.
        ref_df: Frame the scalers are fitted on. When omitted, the
            module-level ``train_df_2`` is used, which is the behaviour of
            the original implementation.

    Returns:
        The same *df* object that was passed in.
    """
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    if ref_df is None:
        # Backward-compatible default: fit on the training frame.
        ref_df = train_df_2

    # stnd: Fare, mnmx: Age, Family
    stnd = StandardScaler()
    stnd.fit(ref_df[['Fare']])
    mnmx = MinMaxScaler()
    mnmx.fit(ref_df[['Age', 'Family']])

    df['Fare_stnd_sc'] = stnd.transform(df[['Fare']])
    df[['Age_mnmx_sc', 'Family_mnmx_sc']] = mnmx.transform(df[['Age', 'Family']])
    return df
get_numeric_sc(train_df_2).describe()
Out[ย ]:
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | Family | Fare_stnd_sc | Age_mnmx_sc | Family_mnmx_sc | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 888.000000 | 888.000000 | 888.000000 | 888.000000 | 888.000000 | 888.000000 | 888.000000 | 888.000000 | 8.880000e+02 | 888.000000 | 888.000000 |
mean | 445.618243 | 0.381757 | 2.313063 | 29.675345 | 0.524775 | 0.381757 | 30.582164 | 1.906532 | 6.801366e-17 | 0.367622 | 0.090653 |
std | 257.405474 | 0.486091 | 0.834007 | 13.019816 | 1.104186 | 0.806949 | 41.176366 | 1.615609 | 1.000564e+00 | 0.163607 | 0.161561 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | -7.431301e-01 | 0.000000 | 0.000000 |
25% | 222.750000 | 0.000000 | 2.000000 | 22.000000 | 0.000000 | 0.000000 | 7.895800 | 1.000000 | -5.512664e-01 | 0.271174 | 0.000000 |
50% | 445.500000 | 0.000000 | 3.000000 | 29.675345 | 0.000000 | 0.000000 | 14.454200 | 1.000000 | -3.919008e-01 | 0.367622 | 0.000000 |
75% | 667.250000 | 1.000000 | 3.000000 | 35.000000 | 1.000000 | 0.000000 | 30.771850 | 2.000000 | 4.609266e-03 | 0.434531 | 0.100000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 263.000000 | 11.000000 | 5.647628e+00 | 1.000000 | 1.000000 |
Inย [ย ]:
# Categorical features
def get_category(df, ref_df=None):
    """Encode the categorical columns of *df* and return the widened frame.

    ``Pclass`` and ``Sex`` are label-encoded into ``Pclass_le`` and
    ``Sex_le``; ``Embarked`` is one-hot encoded into columns named by the
    encoder's ``get_feature_names_out()``. All encoders are fitted on the
    reference frame and then applied to *df*.

    Note that ``Pclass_le`` and ``Sex_le`` are also written to the frame
    passed in, while the returned frame is a new object: its index has
    been reset (the previous index is kept as an ``index`` column) and the
    one-hot columns are appended.

    Args:
        df: Frame with ``Pclass``, ``Sex`` and ``Embarked`` columns.
        ref_df: Frame the encoders are fitted on. When omitted, the
            module-level ``train_df_2`` is used, which is the behaviour of
            the original implementation.

    Returns:
        A new DataFrame containing the encoded columns.
    """
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    if ref_df is None:
        # Backward-compatible default: fit on the training frame.
        ref_df = train_df_2

    # LabelEncoder works on a 1-D array; fitting on a one-column DataFrame
    # is what produced the DataConversionWarning, so pass the Series.
    le = LabelEncoder()
    le.fit(ref_df['Pclass'])
    df['Pclass_le'] = le.transform(df['Pclass'])

    le2 = LabelEncoder()
    le2.fit(ref_df['Sex'])
    df['Sex_le'] = le2.transform(df['Sex'])

    # Reset the index so it matches the fresh 0..n-1 index of the one-hot
    # frame built below; otherwise concat(axis=1) would misalign rows.
    df = df.reset_index()

    oe = OneHotEncoder()
    oe.fit(ref_df[['Embarked']])
    embarked_csr = oe.transform(df[['Embarked']])
    embarked_csr_df = pd.DataFrame(
        embarked_csr.toarray(),
        columns=oe.get_feature_names_out(),
    )
    df = pd.concat([df, embarked_csr_df], axis=1)
    return df
train_df_2 = get_category(train_df_2)
c:\Users\LOVE\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\preprocessing\_label.py:97: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) c:\Users\LOVE\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\preprocessing\_label.py:97: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
4. 모델 수립¶
Inย [ย ]:
def get_model(df):
    """Fit a logistic regression that predicts ``Survived`` and return it.

    Args:
        df: Preprocessed frame containing the eight feature columns listed
            below and the ``Survived`` target column.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    from sklearn.linear_model import LogisticRegression

    feature_cols = [
        'Age_mnmx_sc',
        'Fare_stnd_sc',
        'Family_mnmx_sc',
        'Pclass_le',
        'Sex_le',
        'Embarked_C',
        'Embarked_Q',
        'Embarked_S',
    ]
    X = df[feature_cols]
    # Pass the target as a 1-D Series; a one-column DataFrame is a column
    # vector and makes sklearn emit a DataConversionWarning.
    y = df['Survived']
    lor = LogisticRegression()
    return lor.fit(X, y)
model_output = get_model(train_df_2)
model_output
c:\Users\LOVE\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py:1229: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
Out[ย ]:
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
Inย [ย ]:
# Predict on the training rows using the model's eight input columns.
feature_cols = [
    'Age_mnmx_sc',
    'Fare_stnd_sc',
    'Family_mnmx_sc',
    'Pclass_le',
    'Sex_le',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
]
X = train_df_2[feature_cols]
y_pred = model_output.predict(X)
5. 평가¶
Inย [ย ]:
from sklearn.metrics import accuracy_score, f1_score

# Print accuracy, then F1, comparing the training labels with the
# predictions made on those same training rows.
y_true = train_df_2['Survived']
for metric in (accuracy_score, f1_score):
    print(metric(y_true, y_pred))
0.8029279279279279 0.7320061255742726
6. test 데이터로 적용하기¶
Inย [ย ]:
test_df.head(3)
Out[ย ]:
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
Inย [ย ]:
test_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 418 entries, 0 to 417 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 418 non-null int64 1 Pclass 418 non-null int64 2 Name 418 non-null object 3 Sex 418 non-null object 4 Age 332 non-null float64 5 SibSp 418 non-null int64 6 Parch 418 non-null int64 7 Ticket 418 non-null object 8 Fare 417 non-null float64 9 Cabin 91 non-null object 10 Embarked 418 non-null object dtypes: float64(2), int64(4), object(5) memory usage: 36.0+ KB
Inย [ย ]:
# Run the test set through the preprocessing steps in order:
# family size -> missing values -> numeric scaling -> categorical encoding.
test_df_2 = test_df
for step in (get_family, get_non_missing, get_numeric_sc, get_category):
    test_df_2 = step(test_df_2)
c:\Users\LOVE\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\preprocessing\_label.py:97: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) c:\Users\LOVE\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\preprocessing\_label.py:97: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) c:\Users\LOVE\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\preprocessing\_label.py:97: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
Inย [ย ]:
# Select the model's eight input columns from the preprocessed test set
# and show their dtypes / non-null counts.
feature_cols = [
    'Age_mnmx_sc',
    'Fare_stnd_sc',
    'Family_mnmx_sc',
    'Pclass_le',
    'Sex_le',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
]
test_X = test_df_2[feature_cols]
test_X.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 418 entries, 0 to 417 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age_mnmx_sc 418 non-null float64 1 Fare_stnd_sc 418 non-null float64 2 Family_mnmx_sc 418 non-null float64 3 Pclass_le 418 non-null int64 4 Sex_le 418 non-null int32 5 Embarked_C 418 non-null float64 6 Embarked_Q 418 non-null float64 7 Embarked_S 418 non-null float64 dtypes: float64(6), int32(1), int64(1) memory usage: 24.6 KB
Inย [ย ]:
y_test_pred = model_output.predict(test_X)
Inย [ย ]:
sub_df = pd.read_csv('C:/Users/LOVE/Downloads/vscode/ML/titanic/gender_submission.csv')
sub_df.head(10)
Out[ย ]:
PassengerId | Survived | |
---|---|---|
0 | 892 | 0 |
1 | 893 | 1 |
2 | 894 | 0 |
3 | 895 | 0 |
4 | 896 | 1 |
5 | 897 | 0 |
6 | 898 | 1 |
7 | 899 | 0 |
8 | 900 | 1 |
9 | 901 | 0 |
Inย [ย ]:
sub_df['Survived'] = y_test_pred
sub_df.head(10)
Out[ย ]:
PassengerId | Survived | |
---|---|---|
0 | 892 | 0 |
1 | 893 | 0 |
2 | 894 | 0 |
3 | 895 | 0 |
4 | 896 | 1 |
5 | 897 | 0 |
6 | 898 | 1 |
7 | 899 | 0 |
8 | 900 | 1 |
9 | 901 | 0 |
Inย [ย ]:
sub_df.to_csv('./08 result.csv', index = False)
'🐍 Python' 카테고리의 다른 글
파이썬 회귀 분류 모델링 실습 (0) | 2024.02.06 |
---|---|
파이썬 테스트 데이터 분리 실습 (0) | 2024.02.05 |
파이썬 스케일링 실습 (0) | 2024.02.05 |
파이썬 인코딩 실습 (0) | 2024.02.05 |
파이썬 결측치 확인 실습 (0) | 2024.02.05 |