一、数据科学工作流技术栈
完整数据科学项目核心工具:
- Pandas:数据处理与分析
- NumPy:数值计算基础
- Matplotlib/Seaborn:数据可视化
- Scikit-learn:机器学习建模
- Flask:模型API部署
二、数据准备与清洗
1. 数据加载与探索
import pandas as pd
import numpy as np
# Load the raw sales dataset from the current working directory.
df = pd.read_csv('sales_data.csv')

# Quick overview: (rows, columns) followed by the first five records.
print(f"数据集形状: {df.shape}")
print("\n前5行数据:")
print(df.head())

# Summary statistics for the numeric columns.
print("\n数值列统计摘要:")
print(df.describe())

# Number of missing values in each column.
print("\n缺失值统计:")
print(df.isnull().sum())
2. 数据清洗转换
# Fill missing values: Age with the overall median, Income with the mean
# income of the row's Education group.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Income'] = df['Income'].fillna(df.groupby('Education')['Income'].transform('mean'))

# Parse the purchase date and derive the calendar month from it.
df['Purchase_Date'] = pd.to_datetime(df['Purchase_Date'])
df['Purchase_Month'] = df['Purchase_Date'].dt.month

# Encode categoricals: Gender as 0/1, Education as one-hot "Edu_*" columns.
# NOTE(review): Series.map turns any value other than 'Male'/'Female' into
# NaN -- confirm the column only contains these two labels.
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
df = pd.get_dummies(df, columns=['Education'], prefix='Edu')

# Remove outliers with the 1.5 * IQR rule: drop rows whose Purchase_Amount
# lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
Q1 = df['Purchase_Amount'].quantile(0.25)
Q3 = df['Purchase_Amount'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
is_outlier = (df['Purchase_Amount'] < lower_bound) | (df['Purchase_Amount'] > upper_bound)
df = df[~is_outlier]
三、探索性数据分析
1. 可视化分析
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's white-grid theme for all figures below.
# NOTE(review): the plot titles are Chinese; matplotlib's default font may
# not render them -- confirm a CJK-capable font is configured.
sns.set(style="whitegrid")

# Side-by-side figure: Age histogram (left) and Purchase_Amount by Gender (right).
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.histplot(df['Age'], bins=20, kde=True)
plt.title('年龄分布')
plt.subplot(1, 2, 2)
sns.boxplot(x='Gender', y='Purchase_Amount', data=df)
plt.title('性别与消费金额')
plt.tight_layout()
plt.show()

# Correlation heatmap. Only numeric/boolean columns are correlated: the frame
# still holds the datetime Purchase_Date column (and possibly other
# non-numeric columns), which DataFrame.corr() rejects on recent pandas.
plt.figure(figsize=(10, 8))
corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title('变量相关性热力图')
plt.show()
四、机器学习建模
1. 特征工程与模型训练
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Features are every column except the identifier, the raw date and the
# label; the label column is 'Target'.
X = df.drop(['CustomerID', 'Purchase_Date', 'Target'], axis=1)
y = df['Target']

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Pipeline: standardize the features, then fit a random forest classifier.
model = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42)),
])

# Fit on the training split.
model.fit(X_train, y_train)

# Evaluate on the held-out split: accuracy plus the per-class report.
y_pred = model.predict(X_test)
print("模型准确率:", accuracy_score(y_test, y_pred))
print("\n分类报告:")
print(classification_report(y_test, y_pred))
2. 模型优化与解释
from sklearn.model_selection import GridSearchCV
import shap
# Hyper-parameter search over the forest size and depth. The "classifier__"
# prefix routes each parameter to the pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7],
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("最佳参数:", grid_search.best_params_)
print("最佳分数:", grid_search.best_score_)

# SHAP explanation of the tuned forest.
best_pipeline = grid_search.best_estimator_
explainer = shap.TreeExplainer(best_pipeline.named_steps['classifier'])

# The forest was fitted on the scaler's output, so its split thresholds live
# in scaled space; explain the scaled test rows, not the raw ones.
X_test_scaled = best_pipeline.named_steps['scaler'].transform(X_test)
shap_values = explainer.shap_values(X_test_scaled)

# Pick the values for class index 1. Depending on the shap version the result
# is either a list with one array per class or a single array whose last
# axis indexes the classes.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif getattr(shap_values, 'ndim', 0) == 3:
    positive_class_shap = shap_values[..., 1]
else:
    positive_class_shap = shap_values

# X_test is passed only so the bars are labelled with the feature names.
shap.summary_plot(positive_class_shap, X_test, plot_type="bar")
五、模型部署与应用
1. Flask API开发
from flask import Flask, request, jsonify
import pickle
import pandas as pd
# Persist the best pipeline found by the grid search.
with open('model.pkl', mode='wb') as model_file:
    model_file.write(pickle.dumps(grid_search.best_estimator_))

# Flask application object that hosts the prediction endpoint.
app = Flask(__name__)

# Restore the pipeline from disk; `model` is what the request handler uses.
with open('model.pkl', mode='rb') as model_file:
    model = pickle.loads(model_file.read())
@app.route('/predict', methods=['POST'])
def predict():
    """Score a single record posted as a JSON object.

    The request body must be a JSON object mapping feature names to values.

    Returns:
        On success, a JSON response with ``prediction`` (int class label),
        ``probability`` (float, second column of ``predict_proba``) and
        ``status: 'success'``. On failure, a JSON response with ``error`` and
        ``status: 'failed'``, sent with HTTP 400 for malformed input or
        HTTP 500 for an unexpected error during prediction.
    """
    # silent=True yields None instead of raising when the body is missing,
    # is not JSON, or has the wrong content type.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not data:
        return jsonify({
            'error': 'request body must be a non-empty JSON object',
            'status': 'failed'
        }), 400

    try:
        # One-row frame built from the posted feature values.
        input_data = pd.DataFrame([data])

        # If the model recorded its training column names, require all of
        # them and put them in the training order (JSON key order is
        # arbitrary, and estimators fitted on a DataFrame check names/order).
        expected = getattr(model, 'feature_names_in_', None)
        if expected is not None:
            missing = [name for name in expected if name not in input_data.columns]
            if missing:
                return jsonify({
                    'error': f'missing features: {missing}',
                    'status': 'failed'
                }), 400
            input_data = input_data[list(expected)]

        prediction = model.predict(input_data)
        probability = model.predict_proba(input_data)
        result = {
            'prediction': int(prediction[0]),
            'probability': float(probability[0][1]),
            'status': 'success'
        }
    except Exception as e:
        # Top-level boundary of the endpoint: log the traceback and report
        # the failure with a server-error status instead of HTTP 200.
        app.logger.exception('prediction failed')
        return jsonify({'error': str(e), 'status': 'failed'}), 500

    return jsonify(result)
if __name__ == '__main__':
    # Start the Flask server on all network interfaces, port 5000, when this
    # file is executed directly.
    listen_host, listen_port = '0.0.0.0', 5000
    app.run(host=listen_host, port=listen_port)
2. 客户端调用示例
import requests
import json
# Example feature record to score. The "Edu_*" keys follow the one-hot
# column naming produced with prefix='Edu' during data cleaning.
sample_data = {
    "Age": 35,
    "Gender": 1,
    "Income": 50000,
    "Purchase_Amount": 1200,
    "Purchase_Month": 6,
    "Edu_Bachelor": 1,
    "Edu_Master": 0,
}

# Send the prediction request. json= serializes the dict and sets the
# Content-Type header; the timeout keeps the call from hanging forever when
# the server is unreachable (requests applies no timeout by default).
response = requests.post(
    'http://localhost:5000/predict',
    json=sample_data,
    timeout=10,
)
print("预测结果:", response.json())
