■ Linear Regression Practice: Height - Weight
In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
Linear Regression Practice
- Create sample data
In [ ]:
heights = [187, 174, 179, 192, 188, 160, 179, 168, 168, 174]
weights = [87, 81, 82, 92, 90, 61, 86, 66, 69, 69]
body_df = pd.DataFrame({'height':heights, 'weight':weights})
body_df.head(3)
Out[ ]:
|   | height | weight |
|---|--------|--------|
| 0 | 187    | 87     |
| 1 | 174    | 81     |
| 2 | 179    | 82     |
- Data visualization: scatter plot
In [ ]:
sns.scatterplot(data=body_df, x='weight', y='height')
plt.title('Weight vs Height')
plt.xlabel('weight(kg)')
plt.ylabel('height(cm)')
plt.show()
- Train (fit) the linear regression model
In [ ]:
from sklearn.linear_model import LinearRegression
# create a LinearRegression model object and store it in a variable
model_lr = LinearRegression()
# check the type
type(model_lr)
Out[ ]:
sklearn.linear_model._base.LinearRegression
In [ ]:
# DataFrame[]   : returns a Series (a single column of the DataFrame)
# DataFrame[[]] : returns a DataFrame
x = body_df[['weight']]
y = body_df[['height']]
In [ ]:
# fit the model to the data
model_lr.fit(X = x, y = y)
Out[ ]:
LinearRegression()
In [ ]:
# check the weight (w1) and assign it to a variable
print(model_lr.coef_)
w1 = model_lr.coef_[0][0]
# check the bias (intercept, w0)
print(model_lr.intercept_)
w0 = model_lr.intercept_[0]
[[0.86251245]]
[109.36527488]
In [ ]:
print('y = {}x + {}'.format(w1.round(2), w0.round(2)))
y = 0.86x + 109.37
Conclusion: a person's height (y) can be estimated by multiplying their weight (x) by 0.86 and adding 109.37.
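As a quick sanity check (not part of the original notebook), the fitted model can also be asked for a prediction directly; the 75 kg input below is an arbitrary illustrative value.

```python
# predict the height of a hypothetical 75 kg person (arbitrary example value)
new_weight = pd.DataFrame({'weight': [75]})
print(model_lr.predict(new_weight))  # about 0.86*75 + 109.37 ≈ 174.1
```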
- Add a prediction column using the equation above
In [ ]:
body_df['pred'] = body_df['weight']*w1 + w0
body_df.head(3)
Out[ ]:
|   | height | weight | pred       |
|---|--------|--------|------------|
| 0 | 187    | 87     | 184.403858 |
| 1 | 174    | 81     | 179.228784 |
| 2 | 179    | 82     | 180.091296 |
- Compute the errors, then square them
In [ ]:
body_df['error'] = body_df['height'] - body_df['pred']
body_df['error^2'] = body_df['error']*body_df['error']
body_df.head(3)
Out[ ]:
|   | height | weight | pred       | error     | error^2   |
|---|--------|--------|------------|-----------|-----------|
| 0 | 187    | 87     | 184.403858 | 2.596142  | 6.739951  |
| 1 | 174    | 81     | 179.228784 | -5.228784 | 27.340178 |
| 2 | 179    | 82     | 180.091296 | -1.091296 | 1.190927  |
- Compute the MSE (sum the squared errors and divide by the number of rows)
In [ ]:
body_df['error^2'].sum() / len(body_df)
Out[ ]:
10.152939045376309
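The same value can be double-checked with numpy in a single line, reusing the columns created above; this is just a verification sketch.

```python
# MSE = mean of the squared errors; should match the manual result (~10.15)
print(np.mean((body_df['height'] - body_df['pred']) ** 2))
```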
- Visualization
In [ ]:
sns.scatterplot(data = body_df, x ='weight', y = 'height')
sns.lineplot(data = body_df, x = 'weight', y = 'pred', color = 'red')
Out[ ]:
<Axes: xlabel='weight', ylabel='height'>
- Model evaluation
In [ ]:
from sklearn.metrics import mean_squared_error, r2_score
In [ ]:
# evaluation functions generally take the ground truth (y_true) and the predictions (y_pred)
y_true = body_df['height']
y_pred = body_df['pred']
mean_squared_error(y_true, y_pred)
Out[ ]:
10.152939045376309
In [ ]:
r2_score(y_true, y_pred)
Out[ ]:
0.8899887415172141
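For reference, r2_score follows the usual definition R² = 1 − SS_res / SS_tot; a minimal sketch recomputing it from the existing columns gives the same ≈ 0.89.

```python
# R^2 = 1 - (residual sum of squares) / (total sum of squares)
ss_res = ((body_df['height'] - body_df['pred']) ** 2).sum()
ss_tot = ((body_df['height'] - body_df['height'].mean()) ** 2).sum()
print(1 - ss_res / ss_tot)
```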
In [ ]:
y_pred2 = model_lr.predict(body_df[['weight']])
y_pred2
Out[ ]:
array([[184.40385835],
       [179.22878362],
       [180.09129608],
       [188.71642061],
       [186.99139571],
       [161.97853455],
       [183.54134589],
       [166.29109682],
       [168.87863418],
       [168.87863418]])
In [ ]:
mean_squared_error(y_true, y_pred2)
Out[ ]:
10.152939045376309
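Note that y_pred2 comes back as a 2-D array of shape (10, 1) because the model was fitted with a one-column DataFrame as y; if a flat vector is preferred, it can simply be flattened (a small optional sketch).

```python
# ravel() turns the (10, 1) prediction array into a flat (10,) vector
y_pred2_flat = y_pred2.ravel()
print(y_pred2_flat[:3])
```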
■ Linear Regression Practice: Total Bill - Tip
In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
Linear Regression Practice 2
- Load a built-in seaborn dataset
In [ ]:
tips_df = sns.load_dataset('tips')
tips_df.head(3)
Out[ ]:
|   | total_bill | tip  | sex    | smoker | day | time   | size |
|---|------------|------|--------|--------|-----|--------|------|
| 0 | 16.99      | 1.01 | Female | No     | Sun | Dinner | 2    |
| 1 | 10.34      | 1.66 | Male   | No     | Sun | Dinner | 3    |
| 2 | 21.01      | 3.50 | Male   | No     | Sun | Dinner | 3    |
- Data visualization: scatter plot
In [ ]:
sns.scatterplot(data = tips_df, x = 'total_bill', y = 'tip')
plt.title('total bill & tip')
plt.xlabel('total bill')
plt.ylabel('tip')
plt.show()
- Train (fit) the linear regression model
In [ ]:
from sklearn.linear_model import LinearRegression
model_lr = LinearRegression()
x = tips_df[['total_bill']]
y = tips_df[['tip']]
model_lr.fit(x, y)
Out[ ]:
LinearRegression()
In [ ]:
# check the weight (w1) and assign it to a variable
print(model_lr.coef_)
w1 = model_lr.coef_[0][0]
# check the bias (intercept, w0)
print(model_lr.intercept_)
w0 = model_lr.intercept_[0]
# check the resulting equation
print('y = {}x + {}'.format(w1.round(2), w0.round(2)))
[[0.10502452]]
[0.92026961]
y = 0.11x + 0.92
Conclusion: for every additional $1 of total bill, the tip increases by about $0.11.
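As an illustration (the $20 bill below is an arbitrary value, not taken from the dataset), the fitted model can estimate the tip for a single bill.

```python
# estimated tip for a hypothetical $20.00 total bill
sample_bill = pd.DataFrame({'total_bill': [20.0]})
print(model_lr.predict(sample_bill))  # about 0.105*20 + 0.92 ≈ 3.02
```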
- Generate predictions with the fitted model
In [ ]:
y_true = tips_df['tip']
y_pred = model_lr.predict(tips_df[['total_bill']])
- Model evaluation
In [ ]:
from sklearn.metrics import mean_squared_error, r2_score
mean_squared_error(y_true, y_pred)
Out[ ]:
1.036019442011377
In [ ]:
r2_score(y_true, y_pred)
Out[ ]:
0.45661658635167657
- Visualization
In [ ]:
tips_df['pred'] = y_pred
tips_df.head(3)
Out[ ]:
|   | total_bill | tip  | sex    | smoker | day | time   | size | pred     |
|---|------------|------|--------|--------|-----|--------|------|----------|
| 0 | 16.99      | 1.01 | Female | No     | Sun | Dinner | 2    | 2.704636 |
| 1 | 10.34      | 1.66 | Male   | No     | Sun | Dinner | 3    | 2.006223 |
| 2 | 21.01      | 3.50 | Male   | No     | Sun | Dinner | 3    | 3.126835 |
In [ ]:
sns.scatterplot(data = tips_df, x ='total_bill', y = 'tip')
sns.lineplot(data = tips_df, x = 'total_bill', y = 'pred', color = 'red')
Out[ ]:
<Axes: xlabel='total_bill', ylabel='tip'>
Additional analysis of other variables such as sex and smoking status is needed → proceed with multiple linear regression.
- Using categorical data
In [ ]:
def get_sex(x):
    # encode Female as 0, Male as 1
    if x == 'Female':
        return 0
    else:
        return 1
tips_df['sex_en'] = tips_df['sex'].apply(get_sex)
tips_df.head(3)
Out[ ]:
|   | total_bill | tip  | sex    | smoker | day | time   | size | pred     | sex_en |
|---|------------|------|--------|--------|-----|--------|------|----------|--------|
| 0 | 16.99      | 1.01 | Female | No     | Sun | Dinner | 2    | 2.704636 | 0      |
| 1 | 10.34      | 1.66 | Male   | No     | Sun | Dinner | 3    | 2.006223 | 1      |
| 2 | 21.01      | 3.50 | Male   | No     | Sun | Dinner | 3    | 3.126835 | 1      |
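A common alternative to writing a mapping function by hand is pandas' built-in one-hot encoding; the sketch below only shows that option for comparison, it is not what this notebook uses.

```python
# pd.get_dummies replaces 'sex' with 0/1 indicator column(s);
# drop_first=True keeps a single column to avoid redundancy
tips_encoded = pd.get_dummies(tips_df, columns=['sex'], drop_first=True)
tips_encoded.head(3)
```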
- Training
In [ ]:
model_lr2 = LinearRegression()
x = tips_df[['total_bill', 'sex_en']]
y = tips_df[['tip']]
model_lr2.fit(x, y)
Out[ ]:
LinearRegression()
- Prediction
In [ ]:
y_pred2 = model_lr2.predict(x)
- Evaluation
In [ ]:
# MSE of the simple linear regression (x = total_bill)
print('simple linear regression', mean_squared_error(y_true, y_pred))
# MSE of the multiple linear regression (x = total_bill, sex_en)
print('multiple linear regression', mean_squared_error(y_true, y_pred2))
simple linear regression 1.036019442011377
multiple linear regression 1.0358604137213616
In [ ]:
# R² of the simple linear regression (x = total_bill)
print('simple linear regression', r2_score(y_true, y_pred))
# R² of the multiple linear regression (x = total_bill, sex_en)
print('multiple linear regression', r2_score(y_true, y_pred2))
simple linear regression 0.45661658635167657
multiple linear regression 0.45669999534149963
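Since adding sex_en barely changes R², adjusted R², which penalizes extra predictors, can be a fairer way to compare the two models; the sketch below is an addition, assuming n rows and p features.

```python
# adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

n = len(tips_df)
print(adjusted_r2(r2_score(y_true, y_pred), n, 1))   # simple model, 1 feature
print(adjusted_r2(r2_score(y_true, y_pred2), n, 2))  # multiple model, 2 features
```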
■ Logistic Regression Practice: Titanic
In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
Logistic Regression Practice
- Load the dataset
In [ ]:
titanic_df = pd.read_csv('C:/Users/LOVE/Downloads/vscode/ML/titanic/train.csv', encoding='UTF-8')
titanic_df.head(3)
Out[ ]:
|   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
- Check for missing values
In [ ]:
titanic_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  891 non-null    int64
 1   Survived     891 non-null    int64
 2   Pclass       891 non-null    int64
 3   Name         891 non-null    object
 4   Sex          891 non-null    object
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64
 7   Parch        891 non-null    int64
 8   Ticket       891 non-null    object
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object
 11  Embarked     889 non-null    object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
- x = Fare
In [ ]:
x1 = titanic_df[['Fare']]
y_true = titanic_df[['Survived']]
from sklearn.linear_model import LogisticRegression
model_lor = LogisticRegression()
model_lor.fit(x1, y_true)
DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Out[ ]:
LogisticRegression()
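The DataConversionWarning appears because y was passed as a one-column DataFrame; as the warning itself suggests, passing a 1-D array or Series avoids it. A minimal sketch (model_lor_alt is an illustrative name; the fitted coefficients come out the same either way):

```python
# selecting the column with single brackets gives a Series (1-D), so sklearn no longer warns
y_true_1d = titanic_df['Survived']
model_lor_alt = LogisticRegression()
model_lor_alt.fit(x1, y_true_1d)
```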
In [ ]:
sns.scatterplot(titanic_df, x = 'Fare', y = 'Survived')
Out[ ]:
<Axes: xlabel='Fare', ylabel='Survived'>
In [ ]:
sns.histplot(titanic_df, x = 'Fare')
Out[ ]:
<Axes: xlabel='Fare', ylabel='Count'>
In [ ]:
# descriptive statistics (numeric columns only)
titanic_df.describe()
Out[ ]:
|       | PassengerId | Survived   | Pclass     | Age        | SibSp      | Parch      | Fare       |
|-------|-------------|------------|------------|------------|------------|------------|------------|
| count | 891.000000  | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean  | 446.000000  | 0.383838   | 2.308642   | 29.699118  | 0.523008   | 0.381594   | 32.204208  |
| std   | 257.353842  | 0.486592   | 0.836071   | 14.526497  | 1.102743   | 0.806057   | 49.693429  |
| min   | 1.000000    | 0.000000   | 1.000000   | 0.420000   | 0.000000   | 0.000000   | 0.000000   |
| 25%   | 223.500000  | 0.000000   | 2.000000   | 20.125000  | 0.000000   | 0.000000   | 7.910400   |
| 50%   | 446.000000  | 0.000000   | 3.000000   | 28.000000  | 0.000000   | 0.000000   | 14.454200  |
| 75%   | 668.500000  | 1.000000   | 3.000000   | 38.000000  | 1.000000   | 0.000000   | 31.000000  |
| max   | 891.000000  | 1.000000   | 3.000000   | 80.000000  | 8.000000   | 6.000000   | 512.329200 |
In [ ]:
def get_att(x):
    # print the main attributes of a fitted model
    print('classes:', x.classes_)
    print('number of features:', x.n_features_in_)
    print('feature names:', x.feature_names_in_)
    print('weights:', x.coef_)
    print('bias:', x.intercept_)

get_att(model_lor)
classes: [0 1]
number of features: 1
feature names: ['Fare']
weights: [[0.01519617]]
bias: [-0.94129222]
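Because logistic regression models the log-odds, exponentiating the coefficient gives an odds ratio; this interpretation sketch is an addition, not part of the original notebook.

```python
# exp(coef) ≈ 1.015: each additional unit of Fare multiplies the odds of survival by about 1.015
print(np.exp(model_lor.coef_))
```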
In [ ]:
from sklearn.metrics import accuracy_score, f1_score

def get_metrics(true, pred):
    # print classification metrics: accuracy and F1 score
    print('accuracy:', accuracy_score(true, pred))
    print('f1-score:', f1_score(true, pred))
In [ ]:
y_pred_1 = model_lor.predict(x1)
y_pred_1[:10]
Out[ ]:
array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)
In [ ]:
get_metrics(y_true, y_pred_1)
accuracy: 0.6655443322109988
f1-score: 0.354978354978355
Conclusion: Fare by itself is not a very meaningful predictor of survival.
- Multiple logistic regression
In [ ]:
def get_sex(x):
    # encode female as 0, male as 1
    if x == 'female':
        return 0
    else:
        return 1
titanic_df['Sex_en'] = titanic_df['Sex'].apply(get_sex)
titanic_df.head(3)
Out[ ]:
|   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Sex_en |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 1 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 |
In [ ]:
x2 = titanic_df[['Pclass', 'Sex_en', 'Fare']]
y_true = titanic_df[['Survived']]
model_lor2 = LogisticRegression()
model_lor2.fit(x2, y_true)
DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Out[ ]:
LogisticRegression()
In [ ]:
get_att(model_lor2)
classes: [0 1]
number of features: 3
feature names: ['Pclass' 'Sex_en' 'Fare']
weights: [[-8.88331324e-01 -2.53993425e+00 1.64019087e-03]]
bias: [3.02004403]
In [ ]:
y_pred_2 = model_lor2.predict(x2)
y_pred_2[:10]
Out[ ]:
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1], dtype=int64)
In [ ]:
print('# X: Fare')
get_metrics(y_true, y_pred_1)
print('# X: Fare, Pclass, Sex')
get_metrics(y_true, y_pred_2)
# X: Fare
accuracy: 0.6655443322109988
f1-score: 0.354978354978355
# X: Fare, Pclass, Sex
accuracy: 0.7867564534231201
f1-score: 0.7121212121212122
In [ ]:
# probability of Y=1 (i.e. probability of survival) for each row
model_lor2.predict_proba(x2)
Out[ ]:
array([[0.8977979 , 0.1022021 ],
       [0.09546762, 0.90453238],
       [0.40901264, 0.59098736],
       ...,
       [0.40287202, 0.59712798],
       [0.58880217, 0.41119783],
       [0.89772263, 0.10227737]])
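predict_proba returns [P(not survived), P(survived)] for each row, and predict corresponds to thresholding the survival probability at 0.5; a short sketch to confirm that relationship.

```python
# thresholding P(survived) at 0.5 reproduces model_lor2.predict(x2)
proba_survive = model_lor2.predict_proba(x2)[:, 1]
manual_pred = (proba_survive >= 0.5).astype(int)
print((manual_pred == y_pred_2).all())  # expected: True
```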