机器学习实战案例 - 完整项目流程

前置知识：需要先完成机器学习系列前三篇教程的学习

本文重点：完整项目流程实践，建立工程化思维

一、项目概述

1.1 问题描述

我们将完成一个客户流失预测项目。客户流失预测是银行、电信等行业的经典问题，本文使用模拟的银行客户数据：

目标：预测哪些客户可能流失
数据：客户基本信息（信用评分、地区、性别、年龄）、账户余额、持有产品数、是否活跃等
价值：提前识别高风险客户，降低流失率

1.2 项目流程

1. 数据获取与理解
2. 数据探索性分析 (EDA)
3. 数据预处理
4. 特征工程
5. 模型选择与训练
6. 模型评估与调优
7. 模型部署与监控

二、数据获取与理解

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Global plotting/display configuration:
# - fonts that contain Chinese glyphs, so the Chinese titles and labels render
# - ASCII minus sign on the axes, which those fonts can always draw
# - show up to 30 columns when printing a DataFrame
plt.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS', 'SimHei'],
    'axes.unicode_minus': False,
})
pd.options.display.max_columns = 30
# Generate a reproducible synthetic customer table
np.random.seed(42)
n_samples = 5000
# Customer attributes. The entries are evaluated top to bottom, so their order
# fixes the order of draws from the seeded global RNG; reordering them would
# change the generated dataset.
data = {
    'customer_id': range(1, n_samples + 1),
    'credit_score': np.random.randint(300, 850, n_samples),
    'geography': np.random.choice(['France', 'Germany', 'Spain'], n_samples, p=[0.5, 0.25, 0.25]),
    'gender': np.random.choice(['Male', 'Female'], n_samples),
    'age': np.random.randint(18, 92, n_samples),
    'tenure': np.random.randint(0, 11, n_samples),
    # Exponential balance multiplied by a 0/1 mask: about 40% of customers get a zero balance.
    'balance': np.random.exponential(50000, n_samples) * np.random.choice([0, 1], n_samples, p=[0.4, 0.6]),
    'num_of_products': np.random.choice([1, 2, 3, 4], n_samples, p=[0.5, 0.45, 0.04, 0.01]),
    'has_cr_card': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
    'is_active_member': np.random.choice([0, 1], n_samples, p=[0.5, 0.5]),
    'estimated_salary': np.random.uniform(10000, 200000, n_samples),
}
df = pd.DataFrame(data)
# Target variable: a base churn probability plus additive adjustments for a few
# customer traits, so the label is correlated with those features.
churn_prob = (
    0.1 +
    0.2 * (df['age'] > 50).astype(int) +
    0.15 * (df['geography'] == 'Germany').astype(int) +
    0.1 * (df['is_active_member'] == 0).astype(int) +
    0.1 * (df['num_of_products'] == 1).astype(int) +
    0.15 * (df['balance'] == 0).astype(int) -
    0.1 * (df['tenure'] > 5).astype(int)
)
# Keep every probability inside [0.05, 0.8] before sampling the 0/1 label.
churn_prob = np.clip(churn_prob, 0.05, 0.8)
df['exited'] = (np.random.random(n_samples) < churn_prob).astype(int)
print("数据概览:")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line to the output).
df.info()
print("\n前5行:")
print(df.head())
print("\n目标变量分布:")
print(df['exited'].value_counts(normalize=True))

三、探索性数据分析

# 1. Summary statistics for the numeric columns, then level counts for the
#    two string-valued columns.
summary_stats = df.describe()
print("=== 基础统计 ===")
print(summary_stats)
print("\n类别变量分布:")
for cat_col in ('geography', 'gender'):
    level_counts = df[cat_col].value_counts()
    print(f"\n{cat_col}:")
    print(level_counts)
# 2. Target distribution: class counts (bar) and class shares (pie)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# value_counts() sorts by frequency, not by label. Pin the order to (0, 1) so
# the bars always line up with the hard-coded tick labels below, whichever
# class happens to be the larger one.
class_order = [0, 1]
# Counts
df['exited'].value_counts().reindex(class_order, fill_value=0).plot(kind='bar', ax=axes[0])
axes[0].set_title('流失分布')
axes[0].set_xlabel('是否流失')
axes[0].set_ylabel('客户数')
axes[0].set_xticklabels(['未流失', '流失'], rotation=0)
# Shares, drawn in the same class order as the bar chart
df['exited'].value_counts(normalize=True).reindex(class_order, fill_value=0).plot(
    kind='pie', ax=axes[1], autopct='%1.1f%%'
)
axes[1].set_title('流失比例')
axes[1].set_ylabel('')
plt.tight_layout()
plt.savefig('churn_distribution.png', dpi=100, bbox_inches='tight')
plt.close()
# 3. Numeric feature distributions: overlaid histograms for retained vs.
#    churned customers, one panel per feature.
numeric_cols = ['credit_score', 'age', 'tenure', 'balance', 'estimated_salary']
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
panels = axes.ravel()
stayed_mask = df['exited'] == 0
churned_mask = df['exited'] == 1
for panel, column in zip(panels, numeric_cols):
    df.loc[stayed_mask, column].hist(bins=30, alpha=0.5, label='未流失', ax=panel)
    df.loc[churned_mask, column].hist(bins=30, alpha=0.5, label='流失', ax=panel)
    panel.set_title(column)
    panel.legend()
# The 2x3 grid has six panels but only five features; hide the unused last one.
panels[-1].axis('off')
plt.tight_layout()
plt.savefig('numeric_distribution.png', dpi=100, bbox_inches='tight')
plt.close()
# 4. Churn share within each level of the categorical/discrete features
group_cols = ['geography', 'gender', 'num_of_products']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for idx, group_col in enumerate(group_cols):
    panel = axes[idx]
    # normalize='index' makes every row (feature level) sum to 1.
    share_by_level = pd.crosstab(df[group_col], df['exited'], normalize='index')
    share_by_level.plot(kind='bar', ax=panel)
    panel.set_title(f'{group_col}与流失关系')
    panel.set_xlabel(group_col)
    panel.set_ylabel('流失率')
    panel.legend(['未流失', '流失'])
    panel.set_xticklabels(panel.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.savefig('categorical_churn.png', dpi=100, bbox_inches='tight')
plt.close()
# 5. Correlation analysis over the numeric columns.
# customer_id is just the sequential row identifier (1..n_samples) and is
# excluded from the model features, so keep it out of the heatmap as well.
numeric_df = df.select_dtypes(include=[np.number]).drop(columns=['customer_id'])
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('特征相关性热力图')
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=100, bbox_inches='tight')
plt.close()
# 6. Key findings (fixed text summarising the plots above)
key_findings = (
    "1. 德国客户流失率较高",
    "2. 年龄较大的客户更容易流失",
    "3. 不活跃会员流失风险更高",
    "4. 余额为0的客户流失率较高",
)
print("\n=== 关键发现 ===")
for finding in key_findings:
    print(finding)

四、数据预处理

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# 1. Train/test split: 80/20, stratified on the label so both parts keep the
#    same churn rate. The row identifier is not a feature and is dropped.
y = df['exited']
X = df.drop(columns=['customer_id', 'exited'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")
# 2. Column groups, one per preprocessing treatment
numeric_features = ['credit_score', 'age', 'tenure', 'balance', 'estimated_salary']
categorical_features = ['geography', 'gender']
# Despite the name, this group also holds num_of_products, which is generated
# as a count from 1 to 4 rather than a 0/1 flag.
binary_features = ['has_cr_card', 'is_active_member', 'num_of_products']
# 3. 创建预处理管道
from sklearn.preprocessing import OneHotEncoder
# Numeric columns: fill gaps with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)
# Categorical columns: fill gaps with the literal 'Unknown', then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(categorical_steps)
# Route each column group to its transformer; the last group is passed
# through unchanged. The step names ('num', 'cat', 'bin', 'onehot') are looked
# up later when feature names are rebuilt, so they must stay as they are.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('bin', 'passthrough', binary_features),
]
preprocessor = ColumnTransformer(column_routes)
# 4. Sanity check: fit on the training split and report the output width
X_train_processed = preprocessor.fit_transform(X_train)
print(f"\n预处理后特征数: {X_train_processed.shape[1]}")

五、模型训练与比较

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)
import time
# Candidate classifiers, each with a fixed random_state
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(max_depth=10, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}
# Scores computed from the hard 0/1 predictions; AUC is handled separately
# because it needs the predicted probability of the positive class.
label_scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
}
# Fit every candidate on the training split and score it on the test split
results = []
for name, model in models.items():
    # Full pipeline: shared preprocessing followed by the classifier.
    # Note that every pipeline wraps the same `preprocessor` object.
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    # Only the fit call is timed
    start_time = time.time()
    pipe.fit(X_train, y_train)
    train_time = time.time() - start_time

    y_pred = pipe.predict(X_test)
    y_prob = pipe.predict_proba(X_test)[:, 1]

    # Key order defines the column order of the comparison table
    metrics = {'Model': name}
    metrics.update((key, scorer(y_test, y_pred)) for key, scorer in label_scorers.items())
    metrics['AUC'] = roc_auc_score(y_test, y_prob)
    metrics['Train Time'] = train_time
    results.append(metrics)

    print(f"\n=== {name} ===")
    print(f"准确率: {metrics['Accuracy']:.4f}")
    print(f"精确率: {metrics['Precision']:.4f}")
    print(f"召回率: {metrics['Recall']:.4f}")
    print(f"F1分数: {metrics['F1']:.4f}")
    print(f"AUC: {metrics['AUC']:.4f}")
# Comparison table, one row per model
results_df = pd.DataFrame(results)
by_model = results_df.set_index('Model')
print("\n=== 模型对比 ===")
print(by_model)
# Side-by-side charts: quality metrics on the left, fit time on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
score_ax, time_ax = axes
metrics_cols = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC']
by_model[metrics_cols].plot(kind='bar', ax=score_ax)
score_ax.set_title('模型性能对比')
score_ax.set_ylabel('分数')
# Place the legend outside the plotting area so it does not cover the bars
score_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
score_ax.set_xticklabels(score_ax.get_xticklabels(), rotation=45)
by_model['Train Time'].plot(kind='bar', ax=time_ax)
time_ax.set_title('训练时间对比')
time_ax.set_ylabel('时间 (秒)')
time_ax.set_xticklabels(time_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.savefig('model_comparison.png', dpi=100, bbox_inches='tight')
plt.close()

六、模型调优

from sklearn.model_selection import GridSearchCV
# Tune the model chosen from the comparison (assumed here to be the random forest)
pipe_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1)),
])
# Search space. The `classifier__` prefix addresses the pipeline step named
# 'classifier'. 3 * 4 * 3 * 3 = 108 combinations, each evaluated with 5 folds.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [10, 15, 20, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}
# Exhaustive search, selecting on F1
grid_search = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
print("=== 开始网格搜索 ===")
grid_search.fit(X_train, y_train)
print(f"\n最优参数: {grid_search.best_params_}")
print(f"最优F1分数: {grid_search.best_score_:.4f}")
# Score the best pipeline found by the search on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_prob)
print("\n=== 最优模型测试集表现 ===")
print(f"准确率: {test_accuracy:.4f}")
print(f"精确率: {test_precision:.4f}")
print(f"召回率: {test_recall:.4f}")
print(f"F1分数: {test_f1:.4f}")
print(f"AUC: {test_auc:.4f}")
# Confusion matrix of the tuned model on the test split
# (rows: true label, columns: predicted label)
cm = confusion_matrix(y_test, y_pred)
class_labels = ['未流失', '流失']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.title('混淆矩阵')
plt.savefig('confusion_matrix_final.png', dpi=100, bbox_inches='tight')
plt.close()
# ROC curve of the tuned model on the test split
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
test_auc = roc_auc_score(y_test, y_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {test_auc:.4f}')
# Diagonal reference line: the expected curve of a random classifier
plt.plot([0, 1], [0, 1], 'k--', label='随机')
plt.xlabel('假正率')
plt.ylabel('真正率')
plt.title('ROC曲线')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('roc_curve_final.png', dpi=100, bbox_inches='tight')
plt.close()

七、特征重要性分析

# Rebuild the output column names in the order the ColumnTransformer emits
# them: numeric block, one-hot block, pass-through block.
fitted_preprocessor = best_model.named_steps['preprocessor']
fitted_onehot = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
onehot_names = fitted_onehot.get_feature_names_out(categorical_features).tolist()
feature_names = [*numeric_features, *onehot_names, *binary_features]
# Pair each name with the classifier's importance and rank descending
importances = best_model.named_steps['classifier'].feature_importances_
feature_importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
).sort_values('importance', ascending=False)
top_features = feature_importance_df.iloc[:15]
plt.figure(figsize=(10, 8))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title('Top 15 特征重要性')
# barh draws the first row at the bottom; flip so the top-ranked feature is on top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('feature_importance_final.png', dpi=100, bbox_inches='tight')
plt.close()
print("\n=== Top 10 重要特征 ===")
print(feature_importance_df.head(10))

八、模型部署

import joblib
import json
# Persist the tuned pipeline (preprocessing + classifier) as one artifact
joblib.dump(best_model, 'churn_model.joblib')
# Test-set scores stored with the model; cast to plain floats so the values
# are JSON-serialisable.
label_metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}
performance = {
    key: float(scorer(y_test, y_pred)) for key, scorer in label_metrics.items()
}
performance['auc'] = float(roc_auc_score(y_test, y_prob))
# Metadata file: model identity, expected input columns, recorded performance
model_info = {
    'model_name': 'Customer Churn Prediction',
    'version': '1.0',
    'features': {
        'numeric': numeric_features,
        'categorical': categorical_features,
        'binary': binary_features,
    },
    'performance': performance,
}
with open('model_info.json', 'w') as info_file:
    json.dump(model_info, info_file, indent=2)
# Prediction helper
def predict_churn(customer_data, model_path='churn_model.joblib'):
    """Score a customer with the saved model and grade the churn risk.

    Args:
        customer_data: A dict of feature values for one customer (wrapped
            into a one-row DataFrame), or any other object, which is handed
            to the model unchanged. Only the first row of the model output
            is reported.
        model_path: Path of the joblib file to load. The file is read on
            every call.

    Returns:
        A dict with ``churn_probability`` (float, taken from column 1 of
        ``predict_proba``), ``prediction`` (int, from ``predict``) and
        ``risk_level``: 'High' above 0.7, 'Medium' above 0.4, otherwise 'Low'.
    """
    # NOTE(review): joblib.load unpickles the file, so model_path should only
    # ever point at a trusted artifact.
    model = joblib.load(model_path)

    if isinstance(customer_data, dict):
        customer_data = pd.DataFrame([customer_data])

    prob = model.predict_proba(customer_data)[:, 1][0]
    pred = model.predict(customer_data)[0]

    if prob > 0.7:
        risk_level = 'High'
    elif prob > 0.4:
        risk_level = 'Medium'
    else:
        risk_level = 'Low'

    return {
        'churn_probability': float(prob),
        'prediction': int(pred),
        'risk_level': risk_level,
    }
# Smoke test: score one hand-written customer record.
# Keys are listed in the same order as before so the printed dict is unchanged.
sample_customer = dict(
    credit_score=600,
    geography='Germany',
    gender='Male',
    age=45,
    tenure=3,
    balance=50000,
    num_of_products=1,
    has_cr_card=1,
    is_active_member=0,
    estimated_salary=100000,
)
print("\n=== 示例预测 ===")
result = predict_churn(sample_customer)
print(f"客户信息: {sample_customer}")
print(f"预测结果: {result}")

九、项目总结

完整流程回顾

"""
项目完成清单:
✓ 1. 问题定义：明确业务目标和成功指标
✓ 2. 数据理解：探索数据特征和分布
✓ 3. 数据准备：处理缺失值、编码、标准化
✓ 4. 特征工程：选择和创建有效特征
✓ 5. 模型选择：比较多个算法
✓ 6. 模型调优：优化超参数
✓ 7. 模型评估：在测试集验证性能
✓ 8. 模型解释：分析特征重要性
✓ 9. 模型部署：保存模型和预测接口
关键发现:
- 年龄、地理位置、活跃状态是流失的关键预测因素
- 德国客户流失率显著高于其他国家
- 不活跃会员流失风险更高
业务建议:
- 针对高风险群体制定挽留策略
- 提升会员活跃度的激励机制
- 关注德国市场的客户满意度
"""

参考资源

Kaggle客户流失预测 - 竞赛案例

Scikit-learn案例集 - 官方示例

Machine Learning Mastery - 实战教程

Google ML Crash Course - 快速入门

Full Stack Deep Learning - ML工程实践

Made With ML - 端到端ML教程

ML项目最佳实践 - GCP最佳实践

系列完成：

监督学习算法详解
集成学习方法
无监督学习
模型评估与调优
本文：机器学习实战案例
返回：机器学习基础
最后更新: 2026年4月13日

访问 --

搜索文章、标签、项目线索

机器学习实战案例 - 完整项目流程

一、项目概述

1.1 问题描述

1.2 项目流程

二、数据获取与理解

三、探索性数据分析

四、数据预处理

五、模型训练与比较

六、模型调优

七、特征重要性分析

八、模型部署

九、项目总结

完整流程回顾

参考资源

讨论与反馈