Author : tmlab / Date : 2017. 12. 29. 19:14 / Category : Analytics
import os
import pandas as pd
# Work from the directory that holds the Kaggle CSV files.
# A raw string is used because "\S" and "\k" are not valid escape sequences;
# the resulting path is unchanged, but the literal no longer relies on
# deprecated handling of unknown escapes.
os.chdir(r"D:\STUDY\kaggle")

# Load the training and test sets from the current directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Quick look at the first rows and the summary statistics
# (these bare expressions only display output in an interactive session).
train.head()
test.head()
train.describe()
def cnt_NA(df):
    """Print the missing-value count and ratio of every column that has any.

    For each column of *df* containing at least one missing value, one line
    of the form ``<column>:<count>, NA_ratio:<count / number of rows>`` is
    printed. A final ``NA test end`` line is always printed.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        None. The report is written to standard output.
    """
    n_rows = len(df)
    # One vectorised pass over the frame instead of two Python-level sums
    # per column.
    for name, na_count in df.isnull().sum().items():
        na_count = int(na_count)
        if na_count != 0:
            # The f-string also handles non-string column labels.
            print(f"{name}:{na_count}, NA_ratio:{na_count / n_rows}")
    print("NA test end")
cnt_NA(train)
train["SalePrice"].describe()
%matplotlib inline
import seaborn as sns
sns.distplot(train['SalePrice'])
cols = train.columns #전체컬럼
num_cols = train._get_numeric_data().columns # 연속형변수
num_cols = list(num_cols)
cate_cols = list(set(cols) - set(num_cols)) #이산형변수
numeric_data = train._get_numeric_data()
numeric_data.hist(bins=50, figsize=(20,15))
cate_data = train[cate_cols]
cate_data.head()
cate_data.describe()
#43개 컬럼
cate_data[cate_cols[0]].value_counts().plot(kind = "bar")
numeric_data.plot.scatter(x=num_cols[4], y='SalePrice', ylim=(0,800000));
# scatter_matrix is exposed from pandas.plotting; the old
# pandas.tools.plotting module no longer exists in current pandas.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of all numeric columns.
scatter_matrix(numeric_data, figsize=(12,8))

# Correlation of every numeric column with the target, strongest first.
corr_matrix = numeric_data.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)
# `plt` is used below but was never bound anywhere in the script, so the
# pyplot import is done here, next to its first use.
import matplotlib.pyplot as plt
import seaborn as sns

# Heat map of the numeric correlation matrix on a 12x9 figure.
plt.subplots(figsize=(12,9))
sns.heatmap(corr_matrix, vmax=0.9, square=True)
# Start from the numeric columns only and list their missing values.
num_data = train._get_numeric_data()
cnt_NA(num_data)

# Columns with only a few missing values: simply drop the affected rows.
num_data = num_data.dropna(subset=["MasVnrArea", "GarageYrBlt"])
len(num_data)

# LotFrontage is filled with its median instead of being dropped.
median = num_data["LotFrontage"].median()
num_data = num_data.fillna({"LotFrontage": median})

# Confirm that no missing values remain and inspect the resulting frame.
cnt_NA(num_data)
num_data.info()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every column except the last one; the last column is the target.
X = num_data.iloc[:, :-1].values
y = num_data.iloc[:, -1].values

# Hold out 30% of the rows for evaluation, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)
print(*map(len, (X_train, X_test, Y_train, Y_test)))

# Fit the scaler on the training features only, then apply it to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
from keras.models import Sequential
from keras.layers import Dense

# Fully connected regression network: 80 -> 40 -> 1 units.
# The input width is taken from the training matrix instead of a hard-coded
# 37, so the model still builds if the set of feature columns changes.
model = Sequential([
    Dense(80, input_dim=X_train.shape[1], kernel_initializer='normal', activation='selu'),
    Dense(40, kernel_initializer='normal', activation='selu'),
    Dense(1, kernel_initializer='normal'),
])
model.summary()

# Mean squared error loss, optimised with Adam.
model.compile(loss='mse', optimizer='adam')
# `plt` is not bound anywhere else in the script; import it next to its use
# so this cell does not fail with a NameError.
import matplotlib.pyplot as plt

# Train for 100 epochs with mini-batches of 3 samples.
history = model.fit(X_train, Y_train, batch_size=3, epochs=100)

# Training loss per epoch.
plt.plot(history.history["loss"])
plt.title("Loss")
plt.show()
from sklearn.metrics import mean_absolute_error

# Loss of the trained model on the held-out split.
score = model.evaluate(X_test, Y_test, verbose=0)
print(model.metrics_names)
print(score)

# Mean absolute error of the held-out predictions.
# Arguments follow the documented (y_true, y_pred) order.
predictions = model.predict(X_test)
print("Mean Absolute Error : " + str(mean_absolute_error(Y_test, predictions)))