# Author : tmlab / Date : 2017. 12. 29. 19:18 / Category : Analytics
import numpy as np
import pandas as pd

# Load the raw table and peek at the first rows.
data = pd.read_csv('Automobile_data_.csv')
data.head()

# Treat the literal '?' as a missing-value marker and turn it into a real NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)
data.head()
data.info()
data.describe()

# replace() does not change a column's dtype, so 'price' is converted to a
# numeric dtype explicitly here.
# NOTE(review): any other column that contained '?' presumably stays object
# dtype and will not count as numeric later on — confirm that is intended.
data['price'] = pd.to_numeric(data['price'])
# Split the column labels of `data` into numeric and non-numeric groups.
cols = data.columns  # all column names
# Public API instead of the private DataFrame._get_numeric_data(); 'bool' is
# listed as well so boolean columns keep being treated as numeric.
num_cols = list(data.select_dtypes(include=['number', 'bool']).columns)  # continuous variables
num_cols
# Keep the original column order: a set difference has no stable order, which
# would make any positional lookup such as cate_cols[2] vary between runs.
cate_cols = [c for c in cols if c not in num_cols]  # discrete (categorical) variables
cate_cols
# Histograms (50 bins each) drawn on one large figure.
data.hist(bins=50, figsize=(20, 15))

# Sub-frame holding only the columns listed in cate_cols.
cate_data = data[cate_cols]
cate_data.columns
# Bar chart of the value frequencies of the third entry in cate_cols.
cate_data[cate_cols[2]].value_counts().plot(kind = "bar")

# Numeric sub-frame used for modelling. select_dtypes is the public
# replacement for the private DataFrame._get_numeric_data(); 'bool' is listed
# as well so boolean columns keep being treated as numeric.
num_data = data.select_dtypes(include=['number', 'bool'])
num_data.head()
def cnt_NA(df: pd.DataFrame) -> None:
    """Print the missing-value count and ratio of every column that has any.

    For each column of ``df`` containing at least one null value, one line of
    the form ``<column>:<count>, NA_ratio:<count / len(df)>`` is printed, in
    column order. A final ``NA test end`` line is always printed.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        None. The report is written to standard output.
    """
    total_rows = len(df)
    # Count the nulls of all columns in a single vectorised pass.
    na_counts = pd.isnull(df).sum()
    for column, na in na_counts.items():
        na = int(na)
        if na != 0:
            # Format with an f-string so non-string column labels (e.g. ints)
            # do not raise TypeError the way `label + ":"` would.
            print(f"{column}:{na}, NA_ratio:{na / total_rows}")
    print("NA test end")
# Report missing values, discard every row that has at least one of them,
# then report again to confirm that nothing is left.
cnt_NA(num_data)
num_data = num_data.dropna(how='any', axis='index')
cnt_NA(num_data)
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# NOTE(review): no random_state is passed, so the split (and every metric
# derived from it) changes from run to run — confirm that is intended.
train, test = train_test_split(num_data, test_size=0.2)
print(train.shape[0], test.shape[0])
test.head()

# Every column except the last is a feature; the last column is the target.
# NOTE(review): presumably the last numeric column is 'price' — verify
# against the CSV's column order.
train_x, train_y = train.iloc[:, :-1], train.iloc[:, -1]
train_x.head()
train_y.head()

# Convert both training pieces to NumPy arrays.
train_x, train_y = np.asarray(train_x), np.asarray(train_y)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear regression on the training arrays and inspect its parameters.
lr = LinearRegression()
lr.fit(train_x, train_y)
lr.coef_
lr.intercept_

# Hold-out features/target, sliced the same way as the training data:
# all columns but the last are features, the last column is the target.
test_x = test.iloc[:,:-1]
test_y = test.iloc[:, -1]

# The model is fitted on train_x as passed in; converting here guarantees a
# plain array at predict time, since scikit-learn >= 1.0 warns when a model
# fitted without feature names is given a DataFrame. test_x itself stays a
# DataFrame.
y_pred = lr.predict(np.asarray(test_x))
mean_squared_error(test_y, y_pred)
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Ridge regression (L2 penalty strength alpha=1.0) on the same training data.
clf = Ridge(alpha=1.0)
clf.fit(train_x, train_y)

# np.asarray guarantees a plain array at predict time: scikit-learn >= 1.0
# warns when a model fitted without feature names is given a DataFrame.
# It is a no-op when test_x is already an array.
y_pred = clf.predict(np.asarray(test_x))

# Mean squared error on the hold-out rows, then its square root (RMSE).
mse = mean_squared_error(test_y, y_pred)
mse
np.sqrt(mse)