Kaggle 房价预测

1
2
3
4
5
6
7
8
9
10
11
import numpy as np 
import pandas as pd
import os
import torch
from torch.utils import data
from torch import nn

# Load the competition's training and test sets from the input directory.
train_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

train_data.head() # Show the first five rows
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub 0 NaN NaN NaN 0 12 2008 WD Normal 250000
1
2
3
4
# Training set: 1460 samples
print(train_data.shape) # (1460, 81)
# Test set: 1459 samples
print(test_data.shape) # (1459, 80); it lacks the final SalePrice column
1
2
# Stack the train and test feature columns into one frame so that the
# preprocessing below is applied to both at once. The first column is
# dropped from both frames, and the last column of the training frame is
# dropped as well (the head() output shows these are Id and SalePrice).
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
all_features
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside 0 0 NaN NaN NaN 0 2 2008 WD Normal
1 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 0 0 NaN NaN NaN 0 5 2007 WD Normal
2 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside 0 0 NaN NaN NaN 0 9 2008 WD Normal
3 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner 0 0 NaN NaN NaN 0 2 2006 WD Abnorml
4 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 0 0 NaN NaN NaN 0 12 2008 WD Normal
1454 160 RM 21.0 1936 Pave NaN Reg Lvl AllPub Inside 0 0 NaN NaN NaN 0 6 2006 WD Normal
1455 160 RM 21.0 1894 Pave NaN Reg Lvl AllPub Inside 0 0 NaN NaN NaN 0 4 2006 WD Abnorml
1456 20 RL 160.0 20000 Pave NaN Reg Lvl AllPub Inside 0 0 NaN NaN NaN 0 9 2006 WD Abnorml
1457 85 RL 62.0 10441 Pave NaN Reg Lvl AllPub Inside 0 0 NaN MnPrv Shed 700 7 2006 WD Normal
1458 60 RL 74.0 9627 Pave NaN Reg Lvl AllPub Inside 0 0 NaN NaN NaN 0 11 2006 WD Normal
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Inspect the dtype of every feature column (notebook display expression).
all_features.dtypes
'''
MSSubClass int64
MSZoning object
LotFrontage float64
LotArea int64
Street object
...
MiscVal int64
MoSold int64
YrSold int64
SaleType object
SaleCondition object
Length: 79, dtype: object
'''
1
2
3
4
5
6
7
8
9
10
11
12
13
# Names of the columns whose dtype is not object, i.e. the numeric columns
all_features.dtypes[all_features.dtypes != 'object'].index
'''
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
'MoSold', 'YrSold'],
dtype='object')
'''
1
2
3
4
5
6
7
# Put every numeric feature on a common scale by standardizing each column
# (subtract its mean, divide by its standard deviation). After that the
# column mean is 0, so filling the missing values with 0 amounts to
# replacing them with the column mean.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = (
    all_features[numeric_features]
    .apply(lambda col: (col - col.mean()) / col.std())
    .fillna(0)
)
all_features
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 0.067320 RL -0.184443 -0.217841 Pave NaN Reg Lvl AllPub Inside -0.285886 -0.063139 NaN NaN NaN -0.089577 -1.551918 0.157619 WD Normal
1 -0.873466 RL 0.458096 -0.072032 Pave NaN Reg Lvl AllPub FR2 -0.285886 -0.063139 NaN NaN NaN -0.089577 -0.446848 -0.602858 WD Normal
2 0.067320 RL -0.055935 0.137173 Pave NaN IR1 Lvl AllPub Inside -0.285886 -0.063139 NaN NaN NaN -0.089577 1.026577 0.157619 WD Normal
3 0.302516 RL -0.398622 -0.078371 Pave NaN IR1 Lvl AllPub Corner -0.285886 -0.063139 NaN NaN NaN -0.089577 -1.551918 -1.363335 WD Abnorml
4 0.067320 RL 0.629439 0.518814 Pave NaN IR1 Lvl AllPub FR2 -0.285886 -0.063139 NaN NaN NaN -0.089577 2.131647 0.157619 WD Normal
1454 2.419286 RM -2.069222 -1.043758 Pave NaN Reg Lvl AllPub Inside -0.285886 -0.063139 NaN NaN NaN -0.089577 -0.078492 -1.363335 WD Normal
1455 2.419286 RM -2.069222 -1.049083 Pave NaN Reg Lvl AllPub Inside -0.285886 -0.063139 NaN NaN NaN -0.089577 -0.815205 -1.363335 WD Abnorml
1456 -0.873466 RL 3.884968 1.246594 Pave NaN Reg Lvl AllPub Inside -0.285886 -0.063139 NaN NaN NaN -0.089577 1.026577 -1.363335 WD Abnorml
1457 0.655311 RL -0.312950 0.034599 Pave NaN Reg Lvl AllPub Inside -0.285886 -0.063139 NaN MnPrv Shed 1.144116 0.289865 -1.363335 WD Normal
1458 0.067320 RL 0.201080 -0.068608 Pave NaN Reg Lvl AllPub Inside -0.285886 -0.063139 NaN NaN NaN -0.089577 1.763290 -1.363335 WD Normal
1
2
3
4
5
6
# One-hot encode the categorical columns with pandas.
# dummy_na=True treats a missing value (NaN) as a valid category and adds an
# indicator column for it.
# dtype=float keeps the indicator columns numeric: pandas >= 2.0 emits bool
# dummies by default, and a frame that mixes float and bool columns exposes
# an object-dtype array through .values, which torch.tensor cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)

all_features.shape # (2919, 331)
all_features
MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 SaleType_Oth SaleType_WD SaleType_nan SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial SaleCondition_nan
0 0.067320 -0.184443 -0.217841 0.646073 -0.507197 1.046078 0.896679 0.523038 0.580708 -0.29303 0 1 0 0 0 0 0 1 0 0
1 -0.873466 0.458096 -0.072032 -0.063174 2.187904 0.154737 -0.395536 -0.569893 1.177709 -0.29303 0 1 0 0 0 0 0 1 0 0
2 0.067320 -0.055935 0.137173 0.646073 -0.507197 0.980053 0.848819 0.333448 0.097840 -0.29303 0 1 0 0 0 0 0 1 0 0
3 0.302516 -0.398622 -0.078371 0.646073 -0.507197 -1.859033 -0.682695 -0.569893 -0.494771 -0.29303 0 1 0 1 0 0 0 0 0 0
4 0.067320 0.629439 0.518814 1.355319 -0.507197 0.947040 0.753100 1.381770 0.468770 -0.29303 0 1 0 0 0 0 0 1 0 0
1454 2.419286 -2.069222 -1.043758 -1.481667 1.289537 -0.043338 -0.682695 -0.569893 -0.968860 -0.29303 0 1 0 0 0 0 0 1 0 0
1455 2.419286 -2.069222 -1.049083 -1.481667 -0.507197 -0.043338 -0.682695 -0.569893 -0.415757 -0.29303 0 1 0 1 0 0 0 0 0 0
1456 -0.873466 3.884968 1.246594 -0.772420 1.289537 -0.373465 0.561660 -0.569893 1.717643 -0.29303 0 1 0 1 0 0 0 0 0 0
1457 0.655311 -0.312950 0.034599 -0.772420 -0.507197 0.682939 0.370221 -0.569893 -0.229194 -0.29303 0 1 0 0 0 0 0 1 0 0
1458 0.067320 0.201080 -0.068608 0.646073 -0.507197 0.715952 0.465941 -0.045732 0.694840 -0.29303 0 1 0 0 0 0 0 1 0 0
1
2
3
4
5
6
7
8
# Number of training rows (1460). all_features was built as train rows
# followed by test rows, so the first n_train rows are the training set.
n_train = len(train_data)
# Convert the underlying NumPy arrays to float32 tensors.
train_features = torch.tensor(all_features.iloc[:n_train].values,
                              dtype=torch.float32)
test_features = torch.tensor(all_features.iloc[n_train:].values,
                             dtype=torch.float32)
# Labels as a column vector of shape (n_train, 1).
train_labels = torch.tensor(train_data['SalePrice'].values.reshape(-1, 1),
                            dtype=torch.float32)
1
2
3
4
5
6
7
8
9
10
11
12
13
# Use a linear model as a baseline to sanity-check the pipeline
loss = nn.MSELoss()
in_features = train_features.shape[1] # number of columns (input features)
train_features
'''
tensor([[ 0.0673, -0.1844, -0.2178, ..., 1.0000, 0.0000, 0.0000],
[-0.8735, 0.4581, -0.0720, ..., 1.0000, 0.0000, 0.0000],
[ 0.0673, -0.0559, 0.1372, ..., 1.0000, 0.0000, 0.0000],
...,
[ 0.3025, -0.1416, -0.1428, ..., 1.0000, 0.0000, 0.0000],
[-0.8735, -0.0559, -0.0572, ..., 1.0000, 0.0000, 0.0000],
[-0.8735, 0.2439, -0.0293, ..., 1.0000, 0.0000, 0.0000]])
'''
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
def get_net():
    """Return a fresh baseline model: one linear layer with a single output.

    The input width is read from the module-level ``in_features``.
    """
    return nn.Sequential(nn.Linear(in_features, 1))

# Relative error: root-mean-squared error between log-predictions and log-labels
def log_rmse(net, features, labels):
    """Return the RMSE between ``log(net(features))`` and ``log(labels)``.

    Args:
        net: Model called as ``net(features)``.
        features: Input tensor passed to ``net``.
        labels: Target tensor; its logarithm is taken, so values are
            presumably strictly positive (house prices) - confirm for other data.

    Returns:
        The metric as a Python float (``Tensor.item()``).

    The squared error is computed with the module-level ``loss``.
    """
    # This is a pure evaluation metric, so gradient tracking is disabled to
    # avoid building an autograd graph over the whole dataset on every call.
    with torch.no_grad():
        # Clamp predictions below 1 up to 1 so that the logarithm stays
        # finite and non-negative.
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        rmse = torch.sqrt(loss(torch.log(clipped_preds),
                               torch.log(labels)))
    return rmse.item()

def load_array(data_arrays, batch_size, is_train=True):
    """Build a PyTorch data iterator over the given tensors.

    Args:
        data_arrays: Sequence of tensors that is unpacked into a
            ``TensorDataset``.
        batch_size: Number of samples per mini-batch.
        is_train: When true the loader shuffles the samples.

    Returns:
        A ``DataLoader`` yielding mini-batches of the tensors.
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(tensor_dataset, batch_size=batch_size,
                             shuffle=is_train)
    return loader
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train ``net`` with Adam on the MSE loss and record log-RMSE per epoch.

    Args:
        net: Model to optimize in place.
        train_features, train_labels: Training tensors.
        test_features, test_labels: Evaluation tensors; pass ``None`` for
            ``test_labels`` to skip evaluation.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay coefficient.
        batch_size: Mini-batch size.

    Returns:
        ``(train_ls, test_ls)``: lists with one log-RMSE value per epoch;
        ``test_ls`` is empty when ``test_labels`` is ``None``.
    """
    # Adam is the optimization algorithm used here.
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    train_iter = load_array((train_features, train_labels), batch_size)
    evaluate = test_labels is not None
    train_ls = []
    test_ls = []
    for _ in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            batch_loss = loss(net(X), y)
            batch_loss.backward()
            optimizer.step()
        # Metrics are measured once per epoch, after all mini-batches.
        train_ls.append(log_rmse(net, train_features, train_labels))
        if evaluate:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# K-fold cross-validation
def get_k_fold_data(k, i, X, y):
    """Split ``(X, y)`` into training and validation parts for fold ``i``.

    The rows are cut into ``k`` contiguous folds of ``X.shape[0] // k`` rows
    each; the last fold additionally takes the remaining rows, so every
    sample is used even when the row count is not divisible by ``k``.

    Args:
        k: Number of folds, must be greater than 1.
        i: Index of the fold used for validation, ``0 <= i < k``.
        X: 2-D feature tensor, indexed as ``X[rows, :]``.
        y: Label tensor with the same number of rows as ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        ValueError: If ``i`` is not a valid fold index.
    """
    assert k > 1
    if not 0 <= i < k:
        raise ValueError(f'fold index i must be in [0, {k}), got {i}')
    n_samples = X.shape[0]
    fold_size = n_samples // k
    X_train, y_train = None, None
    for j in range(k):
        # The last fold runs to the end so the n_samples % k trailing rows
        # are not silently dropped.
        stop = n_samples if j == k - 1 else (j + 1) * fold_size
        idx = slice(j * fold_size, stop)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation and return the averaged final log-RMSE.

    For each fold a fresh network from ``get_net()`` is trained on the other
    folds and evaluated on the held-out fold; the last-epoch training and
    validation log-RMSE are printed per fold.

    Returns:
        ``(mean_train_log_rmse, mean_valid_log_rmse)`` over the ``k`` folds.
    """
    train_l_sum = 0
    valid_l_sum = 0
    for i in range(k):
        fold_tensors = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *fold_tensors, num_epochs,
                                   learning_rate, weight_decay, batch_size)
        # Only the value after the final epoch counts towards the average.
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        print(f'折{i + 1},训练log rmse{float(train_ls[-1]):f}, '
              f'验证log rmse{float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k
1
2
3
4
5
6
7
8
9
10
11
12
13
# Hyperparameters for the cross-validation run.
k = 5
num_epochs = 100
lr = 5
weight_decay = 0
batch_size = 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')
'''
折1,训练log rmse0.170228, 验证log rmse0.156313
折2,训练log rmse0.162502, 验证log rmse0.192678
折3,训练log rmse0.163904, 验证log rmse0.168219
折4,训练log rmse0.167658, 验证log rmse0.154873
折5,训练log rmse0.162894, 验证log rmse0.182875
5-折验证: 平均训练log rmse: 0.165437, 平均验证log rmse: 0.170992
'''
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set, predict the test set, write a submission.

    Args:
        train_features, train_labels: Training tensors.
        test_features: Test feature tensor, one row per row of ``test_data``.
        test_data: Test DataFrame with an ``Id`` column. It is modified in
            place: a ``SalePrice`` column holding the predictions is added.
        num_epochs, lr, weight_decay, batch_size: Forwarded to ``train``.

    Side effects:
        Prints the final training log-RMSE and writes ``submission.csv``
        (columns ``Id`` and ``SalePrice``) to the current directory.
    """
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    print(f'训练log rmse:{float(train_ls[-1]):f}')
    # Apply the network to the test set; inference needs no gradients.
    with torch.no_grad():
        preds = net(test_features).numpy()
    # Reformat for export to Kaggle. The flat array is assigned positionally:
    # wrapping it in a pd.Series would align on the index and yield NaN for
    # any test_data whose index is not the default 0..n-1 range.
    test_data['SalePrice'] = preds.reshape(-1)
    submission = test_data[['Id', 'SalePrice']]
    submission.to_csv('submission.csv', index=False)


# Train on the full training set with the hyperparameters set for the
# cross-validation run and write the predictions to submission.csv.
train_and_pred(train_features, test_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)
'''
训练log rmse:0.162470
'''
1
# Preview the first rows of the generated submission file.
pd.read_csv('submission.csv').head()
Id SalePrice
1461 119559.195
1462 154014.53
1463 198652.77
1464 217135.89
1465 177476.7

最终 score 为:0.16706