Python数据科学实战：从数据清洗到机器学习模型部署全流程

一、数据科学工作流技术栈

完整数据科学项目核心工具：

Pandas：数据处理与分析
NumPy：数值计算基础
Matplotlib/Seaborn：数据可视化
Scikit-learn：机器学习建模
Flask：模型API部署

二、数据准备与清洗

1. 数据加载与探索

import pandas as pd
import numpy as np

# Load the raw sales dataset from the current working directory.
df = pd.read_csv('sales_data.csv')

# Quick overview: (rows, columns) followed by the first five records.
print(f"数据集形状: {df.shape}")
# The leading "\n" prints a blank separator line before each section title.
print("\n前5行数据:")
print(df.head())

# Summary statistics as reported by DataFrame.describe().
print("\n数值列统计摘要:")
print(df.describe())

# Number of missing values in each column.
print("\n缺失值统计:")
print(df.isnull().sum())

2. 数据清洗转换

# Fill missing values: Age with the overall median, Income with the mean
# income of the rows that share the same Education level.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Income'] = df['Income'].fillna(df.groupby('Education')['Income'].transform('mean'))

# Parse the purchase date and derive the calendar month as a feature.
df['Purchase_Date'] = pd.to_datetime(df['Purchase_Date'])
df['Purchase_Month'] = df['Purchase_Date'].dt.month

# Encode categoricals: a 0/1 mapping for Gender, one-hot columns for Education.
# NOTE: any Gender value other than 'Male'/'Female' becomes NaN after map().
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
df = pd.get_dummies(df, columns=['Education'], prefix='Edu')

# Outlier removal with the 1.5 * IQR rule: drop rows whose Purchase_Amount
# lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR. Rows with a missing
# Purchase_Amount compare False on both sides and are therefore kept.
Q1 = df['Purchase_Amount'].quantile(0.25)
Q3 = df['Purchase_Amount'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['Purchase_Amount'] < (Q1 - 1.5 * IQR)) |
          (df['Purchase_Amount'] > (Q3 + 1.5 * IQR)))]

三、探索性数据分析

1. 可视化分析

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for the figures below.
sns.set(style="whitegrid")

# Figure 1: two panels side by side.
plt.figure(figsize=(12, 5))

# Left panel: histogram of Age (20 bins) with a KDE curve overlaid.
plt.subplot(1, 2, 1)
sns.histplot(df['Age'], bins=20, kde=True)
plt.title('年龄分布')

# Right panel: Purchase_Amount box plot for each Gender code.
plt.subplot(1, 2, 2)
sns.boxplot(x='Gender', y='Purchase_Amount', data=df)
plt.title('性别与消费金额')

plt.tight_layout()
plt.show()

# Figure 2: correlation heatmap.
# numeric_only=True restricts the matrix to numeric columns. The frame still
# contains the Purchase_Date datetime column (and possibly text columns), and
# pandas >= 2.0 no longer drops non-numeric columns silently in corr().
plt.figure(figsize=(10, 8))
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title('变量相关性热力图')
plt.show()

四、机器学习建模

1. 特征工程与模型训练

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Features are every column except the identifier, the raw date and the label.
X = df.drop(['CustomerID', 'Purchase_Date', 'Target'], axis=1)
y = df['Target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Pipeline: standardize the features, then fit a random forest classifier.
model = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42))
])

# Fit the scaler and the classifier on the training split only.
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
print("模型准确率:", accuracy_score(y_test, y_pred))
# The leading "\n" prints a blank separator line before the report title.
print("\n分类报告:")
print(classification_report(y_test, y_pred))

2. 模型优化与解释

from sklearn.model_selection import GridSearchCV
import shap

# Hyperparameter grid. The 'classifier__' prefix routes each parameter to the
# pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7]
}

# Exhaustive search over the grid with 5-fold cross-validation on accuracy.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("最佳参数:", grid_search.best_params_)
print("最佳分数:", grid_search.best_score_)

# SHAP explanation of the tuned classifier.
# The classifier inside the pipeline was trained on the scaler's output, so the
# explainer must be given the same scaled representation rather than raw X_test.
best_pipeline = grid_search.best_estimator_
X_test_scaled = pd.DataFrame(
    best_pipeline.named_steps['scaler'].transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

explainer = shap.TreeExplainer(best_pipeline.named_steps['classifier'])
shap_values = explainer.shap_values(X_test_scaled)

# Select the contributions toward class index 1 (assumes a binary target whose
# positive class is at index 1 -- TODO confirm against the Target column).
# Older SHAP releases return a list with one array per class; newer releases
# return a single array shaped (n_samples, n_features, n_classes).
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    positive_class_shap = shap_values[:, :, 1]
else:
    positive_class_shap = shap_values

shap.summary_plot(positive_class_shap, X_test_scaled, plot_type="bar")

五、模型部署与应用

1. Flask API开发

from flask import Flask, request, jsonify
import pickle
import pandas as pd

# Flask application object that the route decorator below attaches to.
app = Flask(__name__)

# Persist the tuned pipeline so the API process can serve it.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(grid_search.best_estimator_, model_file)

# Read the pipeline back from disk; `model` is what the /predict view uses.
# NOTE: pickle executes code while loading -- only load files you produced yourself.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

@app.route('/predict', methods=['POST'])
def predict():
    """Score one record posted as a JSON object.

    The request body must be a JSON object mapping feature names to values.

    Returns:
        200 with ``{'prediction', 'probability', 'status': 'success'}`` where
        ``probability`` is the model's probability for class index 1.
        400 with ``{'error', 'status': 'failed'}`` when the body is not a
        non-empty JSON object or a required feature is missing.
        500 with ``{'error', 'status': 'failed'}`` when prediction itself fails.
    """
    # silent=True yields None instead of raising for a missing or malformed
    # JSON body, so all input problems funnel into one explicit 400 response.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not data:
        return jsonify({
            'error': 'request body must be a non-empty JSON object',
            'status': 'failed'
        }), 400

    try:
        # One-row frame: keys become columns.
        input_data = pd.DataFrame([data])

        # When the model recorded its training column names, require all of
        # them and put the columns in the same order; keys that are not
        # model features are ignored.
        expected = getattr(model, 'feature_names_in_', None)
        if expected is not None:
            missing = [name for name in expected if name not in input_data.columns]
            if missing:
                return jsonify({
                    'error': f'missing features: {missing}',
                    'status': 'failed'
                }), 400
            input_data = input_data[list(expected)]

        prediction = model.predict(input_data)
        probability = model.predict_proba(input_data)
    except Exception as e:
        # Top-level boundary of the view: log the traceback server-side and
        # report the failure with an error status code instead of HTTP 200.
        app.logger.exception("prediction failed")
        return jsonify({'error': str(e), 'status': 'failed'}), 500

    return jsonify({
        'prediction': int(prediction[0]),
        'probability': float(probability[0][1]),
        'status': 'success'
    })

# Start Flask's built-in server only when this file is executed directly.
# Host 0.0.0.0 listens on every network interface; the port is 5000.
if __name__ == "__main__":
    app.run(port=5000, host="0.0.0.0")

2. 客户端调用示例

import requests
import json

# Example record to score. The keys are presumably the feature columns the
# model was trained on -- verify against the training frame's columns.
sample_data = {
    "Age": 35,
    "Gender": 1,
    "Income": 50000,
    "Purchase_Amount": 1200,
    "Purchase_Month": 6,
    "Edu_Bachelor": 1,
    "Edu_Master": 0
}

# POST the record to the local prediction endpoint.
# json= serializes the dict and sets the application/json Content-Type header;
# timeout= keeps the call from blocking forever if the server does not answer.
response = requests.post(
    'http://localhost:5000/predict',
    json=sample_data,
    timeout=10,
)

print("预测结果:", response.json())