实战项目：房价预测

用 Scikit-learn 走通完整的机器学习流程，从数据清洗到模型调优。

数据集

使用 Kaggle 房价预测竞赛（House Prices: Advanced Regression Techniques）的数据：共 79 个解释性特征（另有 Id 列），目标是预测房屋售价 SalePrice：

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training and test tables; both paths are relative to the
# current working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Report (rows, columns) of each frame as a quick sanity check.
print(f"训练集: {train.shape}")
print(f"测试集: {test.shape}")

第一步：探索性数据分析（EDA）

目标变量分布

# Two panels side by side: raw target on the left, log-transformed target
# on the right.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(10, 4))

sns.histplot(train['SalePrice'], bins=50, ax=ax_raw)
ax_raw.set_title('SalePrice Distribution')

# SalePrice is usually right-skewed; log1p brings it closer to a normal
# shape.
sns.histplot(np.log1p(train['SalePrice']), bins=50, ax=ax_log)
ax_log.set_title('log(SalePrice) Distribution')

缺失值分析

# Count missing values per column, largest first.
missing = train.isna().sum().sort_values(ascending=False)

# Keep only the columns that actually have gaps and express them as a
# percentage of the number of rows.
has_gaps = missing > 0
missing_pct = missing[has_gaps] / len(train) * 100
print(missing_pct.head(20))

# Rules of thumb:
# - more than 50% missing -> drop the feature
# - less than 5% missing  -> impute (median for numeric, mode for categorical)
# - between 5% and 50%    -> add an "is missing" indicator feature

第二步：数据清洗

def clean_data(df, is_train=True):
    """Return a copy of ``df`` with missing values imputed.

    Numeric columns are filled with their median and object-dtype columns
    with their most frequent value. The statistics come from the frame that
    is passed in, so calling this separately on train and test imputes each
    one from its own data. Every column that has gaps is filled, whatever
    its missing ratio is.

    Args:
        df: Input DataFrame; it is copied, not modified.
        is_train: When True, the ``SalePrice`` column is replaced by
            ``log1p(SalePrice)``.

    Returns:
        The cleaned copy of ``df``.

    Raises:
        KeyError: If ``is_train`` is True and ``df`` has no ``SalePrice``
            column.
    """
    data = df.copy()

    # ---- Missing values ----
    # Numeric features: fill with the column median.
    for col in data.select_dtypes(include=[np.number]).columns:
        if data[col].isnull().any():
            data[col] = data[col].fillna(data[col].median())

    # Categorical features: fill with the most frequent value.
    for col in data.select_dtypes(include=['object']).columns:
        if data[col].isnull().any():
            modes = data[col].mode()
            # mode() is empty when every value is missing; there is nothing
            # to impute from, so leave the column untouched instead of
            # failing on modes[0].
            if not modes.empty:
                data[col] = data[col].fillna(modes.iloc[0])

    # ---- Log-transform the target ----
    if is_train:
        data['SalePrice'] = np.log1p(data['SalePrice'])

    return data

# Each frame is cleaned by its own call, so the test frame is imputed from
# its own medians/modes; is_train=False skips the SalePrice log-transform.
train_clean = clean_data(train, is_train=True)
test_clean = clean_data(test, is_train=False)

第三步：特征工程

# Stack train and test so every categorical column is encoded with one
# shared mapping. concat keeps each frame's own row labels, so the two
# parts are separated again by position further down.
all_data = pd.concat([train_clean, test_clean], sort=False)
n_train = len(train_clean)

# Label-encode every object-dtype column (values are cast to str first).
# Only label encoding is applied here; one-hot encoding, or target encoding
# for high-cardinality columns such as Neighborhood, would be alternatives.
from sklearn.preprocessing import LabelEncoder

cat_cols = all_data.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    as_text = all_data[col].astype(str)
    all_data[col] = le.fit_transform(as_text)

# ---- Derived features ----
# Total area: basement + first floor + second floor.
basement_sf = all_data['TotalBsmtSF']
all_data['TotalSF'] = basement_sf + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# House age: year sold minus year built.
all_data['HouseAge'] = all_data['YrSold'].sub(all_data['YearBuilt'])

# 1 when the basement area is positive, otherwise 0.
all_data['HasBsmt'] = basement_sf.gt(0).astype(int)

# ---- Split back into train / test ----
# Every column except the Id and the target is used as a feature.
excluded = ('Id', 'SalePrice')
feat_cols = [name for name in all_data.columns if name not in excluded]
X = all_data.iloc[:n_train][feat_cols]
y = train_clean['SalePrice']
X_test = all_data.iloc[n_train:][feat_cols]

print(f"特征维度: {X.shape}")

第四步：建模

基线模型

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score

def rmse_cv(model, X, y, n_folds=5):
    """Cross-validated RMSE of ``model`` on ``(X, y)``.

    Each fold is scored with negative mean squared error, every fold score
    is converted to an RMSE, and the folds are then summarised.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        n_folds: Value passed to ``cross_val_score`` as ``cv``.

    Returns:
        Tuple ``(mean, std)`` of the per-fold RMSE values.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, y,
        scoring='neg_mean_squared_error',
        cv=n_folds,
        n_jobs=-1,
    )
    # The scorer reports negated MSE, so flip the sign before the root.
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean(), rmse_per_fold.std()

# Baselines: plain linear regression and Ridge regression with alpha=10.
lr = LinearRegression()
ridge = Ridge(alpha=10)

# Score both with the same cross-validation helper.
for label, model in (("LR", lr), ("Ridge", ridge)):
    mean, std = rmse_cv(model, X, y)
    print(f"{label} RMSE: {mean:.5f} (±{std:.5f})")

集成模型

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Random forest: 300 trees, depth capped at 15.
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    min_samples_split=5,
    n_jobs=-1,
    random_state=42,
)

# Gradient boosting: 500 trees of depth 4 with a 0.05 learning rate.
gbr = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    min_samples_split=5,
    random_state=42,
)

# Score both with the same cross-validation helper.
for label, model in (("RF", rf), ("GBR", gbr)):
    mean, std = rmse_cv(model, X, y)
    print(f"{label} RMSE: {mean:.5f} (±{std:.5f})")

第五步：参数调优

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Integer ranges to sample from; scipy's randint excludes the upper bound.
param_dist = dict(
    n_estimators=randint(200, 600),
    max_depth=randint(3, 20),
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
)

# Fresh forest to tune (this rebinds ``rf``).
rf = RandomForestRegressor(n_jobs=-1, random_state=42)
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)
search.fit(X, y)

# best_score_ is a negated MSE, so negate it before taking the root.
# NOTE(review): per the scikit-learn docs best_score_ is the mean over
# folds, so this is sqrt(mean MSE), not the mean of per-fold RMSEs that
# rmse_cv reports; confirm before comparing the two numbers directly.
best_rmse = np.sqrt(-search.best_score_)
print(f"最佳参数: {search.best_params_}")
print(f"最佳 RMSE: {best_rmse:.5f}")

# Keep the estimator carrying the best sampled parameters for later steps.
best_model = search.best_estimator_

第六步：特征重要性分析

# Impurity-based importances of the tuned model, sorted from most to least
# important.
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]

# Show at most 15 features; capping at the number of available features
# avoids an IndexError when fewer than 15 columns are used.
top_n = min(15, len(indices))
top_idx = indices[:top_n]

# Ranked listing of the most important features.
for rank, idx in enumerate(top_idx, start=1):
    print(f"{rank}. {feat_cols[idx]}: {importances[idx]:.4f}")

# Horizontal bar chart; both sequences are reversed so the most important
# feature is drawn at the top.
plt.figure(figsize=(10, 8))
plt.barh(range(top_n), importances[top_idx][::-1])
plt.yticks(range(top_n), [feat_cols[i] for i in top_idx][::-1])
plt.xlabel('Feature Importance')

第七步：模型融合

将多个模型的预测取平均（这里的 VotingRegressor 没有设置 weights，即等权平均；如需加权平均可传入 weights 参数），通常能获得最后一点提升：

from sklearn.ensemble import VotingRegressor

# Average the predictions of three different model families. random_state
# is pinned on the tree-based members, as for the other estimators in this
# file, so the ensemble's CV score is reproducible from run to run.
ensemble = VotingRegressor([
    ('ridge', Ridge(alpha=10)),
    ('rf', RandomForestRegressor(
        n_estimators=300, max_depth=12, random_state=42)),
    ('gbr', GradientBoostingRegressor(
        n_estimators=500, learning_rate=0.05, random_state=42)),
])
mean, std = rmse_cv(ensemble, X, y)
print(f"Ensemble RMSE: {mean:.5f} (±{std:.5f})")

总结

一个完整的 ML 项目流程：

EDA → 数据清洗 → 特征工程 → 基线模型 → 集成模型 → 调优 → 模型融合

几点经验：

不要跳过 EDA — 理解数据比调参更重要
先搭基线（线性回归/逻辑回归），确保 pipeline 跑通再上复杂模型
特征工程决定上限 — 算法的提升空间往往不如好的特征
交叉验证比单次划分可靠 — 避免模型只是碰巧在某一次划分上表现好
模型融合是锦上添花 — 单模型调好之后再考虑

数据集

第一步：探索性数据分析（EDA）

目标变量分布

缺失值分析

相关性分析

第二步：数据清洗

第三步：特征工程

第四步：建模

基线模型

集成模型

第五步：参数调优

第六步：特征重要性分析

第七步：模型融合

总结

数据集

第一步：探索性数据分析（EDA）

目标变量分布

缺失值分析

相关性分析

第二步：数据清洗

第三步：特征工程

第四步：建模

基线模型

集成模型

第五步：参数调优

第六步：特征重要性分析

第七步：模型融合

总结