# 浙大疏锦行
# 代码极简逻辑
# 1. 数据预处理
#    读取信贷csv,分离特征与违约标签;文本特征编码;查看样本是否不平衡。
# 2. 数据集拆分
#    分层划分训练/测试集,保证两组违约样本比例一致。
# 3. 搭建4套对比流水线(防数据泄露)
#    - 基线:标准化+随机森林(对照组)
#    - SMOTE过采样:合成少数违约样本平衡数据
#    - SMOTEENN混合采样:过采样+剔除噪声
#    - 权重平衡:不改动数据,训练时加重少数类损失
# 4. 网格搜索+分层5折交叉验证
#    批量训练4套模型,以F1为优化目标,输出召回率、精确率、AUC。
# 5. 横向对比
#    汇总所有方案指标,自动选出F1最高的最优模型。
# 6. 阈值优化
#    不用默认0.5阈值,通过PR曲线找到F1最佳分割点,优化风控预测效果并绘图展示。
# 7. 可选
#    保存最优模型用于后续预测。
# DAY 15 不平衡数据集的处理
# ============================================================
# 信贷数据集 + 缺失值填充 + 不平衡处理 + 交叉验证 + 超参数调优
# ============================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, recall_score, precision_score, roc_auc_score, precision_recall_curve
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
import warnings
# Silence all warnings for the rest of the script.
warnings.filterwarnings("ignore")

# Matplotlib settings for Chinese text: try CJK-capable fonts first and fall
# back to DejaVu Sans when none of them is installed.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
# Use the ASCII hyphen for negative tick labels instead of the Unicode minus.
plt.rcParams['axes.unicode_minus'] = False
# 1. Load the data
file_path = r"C:\Python Study\Python60DaysChallenge-main\data.csv"
data = pd.read_csv(file_path)
print("数据集形状:", data.shape)
print("\n数据集全部列名:")
print(data.columns.tolist())

TARGET_COL = 'Credit Default'

# Separate the feature matrix from the target label.
X = data.drop(columns=[TARGET_COL])
y = data[TARGET_COL]

# Split the feature columns into numeric columns and text (object-dtype)
# categorical columns.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Report the number of missing values per feature column.
print("\n各列缺失值统计:")
print(X.isnull().sum())

# Encode text features as integer codes. Each fitted encoder is kept in
# le_dict, keyed by column name, so the mapping can be reused later.
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    # Fill missing values BEFORE casting to str: astype(str) turns NaN into
    # the literal string "nan", after which fillna() has nothing to fill.
    X[col] = le.fit_transform(X[col].fillna("Missing").astype(str))
    le_dict[col] = le

# Fill numeric gaps with the column median so that the samplers used later
# never see NaN.
# NOTE(review): the imputer and the label encoders are fitted on the full
# dataset before the train/test split, so test-set statistics leak into
# preprocessing. Consider moving these steps inside the pipelines.
if num_cols:
    X[num_cols] = SimpleImputer(strategy="median").fit_transform(X[num_cols])

# Show the class distribution of the target to check for imbalance.
print("\n原始数据集目标变量分布:")
print(y.value_counts())
plt.figure(figsize=(6, 4))
sns.countplot(x=y)
plt.title('信贷违约类别分布')
plt.show()
# 2. Stratified train/test split: stratify=y keeps the class ratio of the
# target the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\n训练集分布: {pd.Series(y_train).value_counts().to_dict()}")
print(f"测试集分布: {pd.Series(y_test).value_counts().to_dict()}")
# 3. Shared configuration
# Unweighted random forest used by the baseline and both resampling pipelines.
base_clf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid shared by every strategy. The "classifier__" prefix
# routes each entry to the pipeline step named 'classifier'.
param_grid_common = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10],
    'classifier__min_samples_split': [2, 5],
}

# Stratified, shuffled 5-fold splitter reused by every grid search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# 4. Four pipelines to compare. The data has already been imputed and holds
# no NaN, so the SMOTE-based samplers can run.

# Baseline: scaling + random forest, no imbalance handling (control group).
pipeline_baseline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('classifier', base_clf),
])
param_baseline = param_grid_common.copy()

# SMOTE: synthesize minority-class samples before fitting the classifier.
pipeline_smote = ImbPipeline([
    ('scaler', StandardScaler()),
    ('sampler', SMOTE(random_state=42)),
    ('classifier', base_clf),
])
# Also search over the number of neighbours SMOTE uses.
param_smote = {**param_grid_common, 'sampler__k_neighbors': [3, 5]}

# SMOTEENN: combined sampling (over-sampling followed by noise removal).
pipeline_smotenn = ImbPipeline([
    ('scaler', StandardScaler()),
    ('sampler', SMOTEENN(random_state=42)),
    ('classifier', base_clf),
])
param_smotenn = param_grid_common.copy()

# Class weighting: leave the data untouched and re-weight classes in training.
pipeline_weighted = ImbPipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced')),
])
param_weighted = param_grid_common.copy()
# 5. Grid-search helper
def run_gridsearch(pipeline, param_grid, name):
    """Tune one strategy with grid search and evaluate it on the test set.

    Fits a ``GridSearchCV`` (scoring: F1, splitter: module-level ``cv``) on
    the module-level ``X_train`` / ``y_train``, then scores the refitted best
    estimator on ``X_test`` / ``y_test`` and prints the results.

    Args:
        pipeline: Pipeline whose final step is named ``'classifier'``.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        name: Strategy label, used only in the printed output.

    Returns:
        Tuple ``(best_estimator, test_f1, test_recall, test_precision,
        test_auc)``.
    """
    print(f"\n{'=' * 60}")
    print(f"正在运行策略: {name}")
    print(f"{'=' * 60}")

    gs = GridSearchCV(
        pipeline, param_grid, cv=cv, scoring='f1', n_jobs=-1, verbose=1
    )
    gs.fit(X_train, y_train)
    print(f"最佳参数组合: {gs.best_params_}")
    print(f"交叉验证最佳 F1 (平均): {gs.best_score_:.4f}")

    best_estimator = gs.best_estimator_
    y_pred = best_estimator.predict(X_test)
    # Column 1 of predict_proba is the score for the second class label,
    # which is what the AUC is computed on.
    y_score = best_estimator.predict_proba(X_test)[:, 1]

    test_f1 = f1_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    test_auc = roc_auc_score(y_test, y_score)

    print(f"测试集 F1: {test_f1:.4f}")
    print(f"测试集 召回率(Recall): {test_recall:.4f}")
    print(f"测试集 精确率(Precision): {test_precision:.4f}")
    print(f"测试集 AUC: {test_auc:.4f}")
    print("\n分类报告:")
    print(classification_report(y_test, y_pred))
    print("混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))

    return best_estimator, test_f1, test_recall, test_precision, test_auc


# 6. Batch training and comparison
# Run the grid search for every strategy, collecting the test-set metrics
# and the tuned model under the strategy name.
results = {}
best_models = {}
strategies = [
    ('Baseline', pipeline_baseline, param_baseline),
    ('SMOTE', pipeline_smote, param_smote),
    ('SMOTEENN', pipeline_smotenn, param_smotenn),
    ('Weighted', pipeline_weighted, param_weighted),
]
for name, pipe, params in strategies:
    model, f1, rec, prec, auc = run_gridsearch(pipe, params, name)
    results[name] = {'F1': f1, 'Recall': rec, 'Precision': prec, 'AUC': auc}
    best_models[name] = model
# 7. Results summary
print("\n\n" + "=" * 60)
print("各策略性能对比")
print("=" * 60)

# One row per strategy, one column per metric.
df_results = pd.DataFrame(results).T
print(df_results.round(4))

# Pick the strategy with the highest F1.
# NOTE(review): these F1 values come from the test set, so the winner is
# chosen on the same data it is later reported on. Selecting on the
# cross-validated score would avoid that optimism.
best_strategy = df_results['F1'].idxmax()
best_model = best_models[best_strategy]
print(f"\n最优策略: {best_strategy},F1 = {df_results.loc[best_strategy, 'F1']:.4f}")
# 8. Decision-threshold tuning
# NOTE(review): the threshold is tuned and then evaluated on the same test
# set, so the reported metrics are optimistic. Tuning on held-out or
# out-of-fold training predictions would be cleaner.
print("\n阈值微调")
y_proba = best_model.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# F1 at every point of the PR curve; the epsilon avoids 0/0 when both
# precision and recall are zero.
fscores = 2 * (precisions * recalls) / (precisions + recalls + 1e-9)

# precisions/recalls carry one more entry than thresholds, so the last point
# is dropped before locating the best F1.
ix = np.argmax(fscores[:-1])
best_threshold = thresholds[ix]
print(f"最优阈值: {best_threshold:.4f}")
print(f"对应F1:{fscores[ix]:.4f} 召回:{recalls[ix]:.4f} 精确:{precisions[ix]:.4f}")

# Re-classify the test set with the tuned threshold instead of the default 0.5.
y_pred_new = (y_proba >= best_threshold).astype(int)
print("\n调整阈值后分类报告:")
print(classification_report(y_test, y_pred_new))
# Plots
plt.figure(figsize=(12, 5))

# Left panel: precision, recall and F1 as functions of the threshold, with
# the selected threshold marked in red.
plt.subplot(1, 2, 1)
plt.plot(thresholds, precisions[:-1], '--', label='Precision')
plt.plot(thresholds, recalls[:-1], ':', label='Recall')
plt.plot(thresholds, fscores[:-1], linewidth=2, label='F1')
plt.scatter(best_threshold, fscores[ix], c='red', s=100)
plt.xlabel("Threshold")
plt.grid(True)
plt.legend()

# Right panel: precision-recall curve with the selected operating point.
plt.subplot(1, 2, 2)
plt.plot(recalls, precisions)
plt.scatter(recalls[ix], precisions[ix], c='red', s=100)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid(True)

plt.tight_layout()
plt.show()
print("全部执行完毕!")