# 聊聊样本采样技术

• 1）推荐系统中，曝光行为要远大于用户的点击行为，而点击行为又要远大于交易行为；
• 2）在欺诈交易检测中，欺诈交易订单是占总交易数量非常少的一部分的；
• 3）在工厂质量检测中，合格产品的数量是远大于不合格产品的；
• 4）信用卡的征信中，信用正常的用户是远多于信用有风险的用户的。

• 在这类极度不平衡的数据上，如果模型简单地把所有样本都预测为多数类：
• 模型的准确率依然会很高，但少数类样本被完全忽略，因此准确率并不能反映模型的真实效果。

## 〇、数据&Baseline

### 1、数据生成

import random

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_classification
# Build a synthetic 10k-sample binary problem with a 95/5 class imbalance:
# 8 features in total (4 informative, 1 redundant), one cluster per class.
X, y = make_classification(
    n_samples=10000,
    n_classes=2,
    weights=[0.95, 0.05],
    class_sep=1.2,
    n_features=8,
    n_informative=4,
    n_redundant=1,
    n_clusters_per_class=1,
)

from sklearn.decomposition import PCA
def pca_draw(X, y):
    """Project X to 2-D with PCA and scatter-plot both classes.

    Positive samples (y == 1) are drawn in red, negative samples
    (y == 0) in blue, on the current matplotlib axes.
    Fix: removed the no-op expression `pos_X.shape, neg_X.shape`
    (a leftover notebook inspection line).
    """
    pca = PCA(n_components=2)
    pca.fit(X)
    n_X = pca.transform(X)
    pos_X, neg_X = n_X[y==1], n_X[y==0]
    plt.scatter(pos_X[:,0], pos_X[:,1], c='r', linewidths=0.5)
    plt.scatter(neg_X[:,0], neg_X[:,1], c='b', linewidths=0.5)

# Visualize the raw class distribution in 2-D PCA space and print class counts.
pca_draw(X, y)
print('n_pos = %d, n_neg=%d'%(np.sum(y==1), np.sum(y==0)))
# n_pos = 550, n_neg=9450

### 2、训练-测试样本划分

# Random ~70/30 train/test split: each sample lands in the training set with
# probability 0.7.  Vectorized with numpy instead of the original per-element
# Python loop over random.random() (same distribution, one call).
trn_idx = (np.random.random(X.shape[0]) < 0.7).astype(int)
trn_X = X[trn_idx == 1]
trn_y = y[trn_idx == 1]
tst_X = X[trn_idx == 0]
tst_y = y[trn_idx == 0]

### 3、评估方法

def evaluate(y_true, y_pred):
    """Score a binary classifier on imbalanced data at threshold 0.5.

    Parameters
    ----------
    y_true : array of {0, 1} ground-truth labels.
    y_pred : array of predicted positive-class probabilities.

    Returns
    -------
    (pos_recall, neg_precision)
        pos_recall    -- recall of the minority (positive) class;
        neg_precision -- precision of the majority (negative) class.
        Either value is 0.0 when its denominator is empty (no positive
        samples / no predictions below 0.5), instead of dividing by zero.
    """
    n_pos = np.sum(y_true == 1)
    n_pred_neg = np.sum(y_pred < 0.5)
    pos_recall = np.sum(y_pred[y_true == 1] > 0.5) / n_pos if n_pos else 0.0
    neg_precision = np.sum(y_true[y_pred < 0.5] < 0.5) / n_pred_neg if n_pred_neg else 0.0
    return pos_recall, neg_precision

### 4、Baseline

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Shared CV splitter: 5-fold, repeated 3 times (15 fits per candidate), fixed seed.
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

def lr_predict(trn_X, trn_y, tst_X, tst_y, class_weight=None):
    """Tune LogisticRegression's C by repeated-CV AUC, refit, report test metrics.

    Parameters
    ----------
    trn_X, trn_y : training features / labels.
    tst_X, tst_y : test features / labels.
    class_weight : forwarded to LogisticRegression ('balanced', a dict, or
                   None), so the same routine serves both the baseline and
                   the sample-weighting experiment.

    Returns (pos_recall, neg_precision) from `evaluate` on the test set.
    Fixes: np.mean(scores) was computed three times per candidate; output
    string typo 'Precison on Majarity' corrected; metrics now returned.
    """
    C_list = [0.001, 0.01, 0.1, 1, 10, 100]
    bst_c = C_list[0]
    bst_m = 0
    for c in C_list:
        model = LogisticRegression(C=c, class_weight=class_weight)
        # Uses the module-level RepeatedKFold splitter `cv`.
        scores = cross_val_score(model, trn_X, trn_y, scoring='roc_auc', cv=cv, n_jobs=-1)
        mean_score = np.mean(scores)
        print('C = %.3f --> auc: %.5f (%.5f), bst: %.5f' % (c, mean_score, np.std(scores), bst_m))
        if mean_score > bst_m:
            bst_c = c
            bst_m = mean_score
    # Refit on the full training set with the best C found by CV.
    model = LogisticRegression(C=bst_c, class_weight=class_weight)
    model.fit(trn_X, trn_y)

    tst_pred = model.predict_proba(tst_X)
    p_r, n_p = evaluate(tst_y, tst_pred[:, 1])
    print('Precision on Majority: %.3f, Recall on Minority: %.3f' % (n_p, p_r))
    return p_r, n_p

## 一、样本加权

# Inverse-frequency class weights: each class is weighted by the OTHER
# class's prevalence (the mean of a boolean mask equals its frequency).
w_pos = np.mean(trn_y == 0)
w_neg = np.mean(trn_y == 1)
• w_pos = 0.945, w_neg = 0.055

## 二、多数类样本下采样

### 1、随机下采样

def random_undersampling(X, y, balanced_ratio=0.5):
    """Randomly drop majority (y == 0) samples.

    balanced_ratio = n_pos/n_neg targeted after sampling.  Positives are
    always kept; each negative survives independently with the keep
    probability implied by the target ratio.
    """
    # Probability of keeping a single negative sample.
    keep_prob = (np.sum(y == 1) / np.sum(y == 0)) / balanced_ratio
    flags = []
    for label in y:
        # Draw a random number only for negatives (short-circuit), as before.
        if label == 0 and random.random() > keep_prob:
            flags.append(0)
        else:
            flags.append(1)
    mask = np.array(flags)
    n_X = X[mask == 1]
    n_y = y[mask == 1]
    print('before: n_pos= %d, n_neg = %d' % (np.sum(y == 1), np.sum(y == 0)))
    print('after: n_pos= %d, n_neg = %d' % (np.sum(n_y == 1), np.sum(n_y == 0)))
    return n_X, n_y

# Rebalance the training set to n_pos/n_neg = 0.5 and visualize the result.
n_trn_X, n_trn_y = random_undersampling(trn_X, trn_y, balanced_ratio=0.5)
pca_draw(n_trn_X, n_trn_y)

'''
before: n_pos= 391, n_neg = 6672
after: n_pos= 391, n_neg = 748
'''

### 2、NearMiss

NearMiss是Mani等人在2003年提出来的，其核心思想是：保留多数类中到其最近的 k 个少数类样本的平均距离最小的那些点，其中 k 是超参数。

from sklearn.neighbors import DistanceMetric

def near_miss(X, y, k, balanced_ratio=0.5):
    """NearMiss-style undersampling of the majority class.

    Keeps the majority (y == 0) samples whose mean distance to their k
    nearest minority (y == 1) samples is smallest, retaining
    n_pos / balanced_ratio of them, then stacks them with all positives.

    Fixes: the original averaged over `trn_neg.shape[0]` — a global from
    the calling scope — instead of the local `neg`; the deprecated
    `sklearn.neighbors.DistanceMetric` is replaced by scipy's cdist.
    """
    from scipy.spatial.distance import cdist

    pos = X[y == 1]
    neg = X[y == 0]

    # Number of majority samples to retain for the target ratio.
    n_sample = int(pos.shape[0] / balanced_ratio)

    # Euclidean neg-to-pos distances; mean of the k smallest per negative.
    n_d = cdist(neg, pos)
    dist_2_k_pos = np.sort(n_d, axis=1)[:, :k].mean(axis=1)
    sampled_S = neg[dist_2_k_pos.argsort()[:n_sample]]

    n_X = np.vstack((sampled_S, pos))
    n_y = np.array([0] * sampled_S.shape[0] + [1] * pos.shape[0])
    print('before: n_pos= %d, n_neg = %d' % (np.sum(y == 1), np.sum(y == 0)))
    print('after: n_pos= %d, n_neg = %d' % (np.sum(n_y == 1), np.sum(n_y == 0)))
    return n_X, n_y

# NearMiss undersampling: keep negatives closest (k = 3) to the positives.
n_trn_X, n_trn_y =near_miss(trn_X, trn_y, k=3, balanced_ratio=0.5)
'''
before: n_pos= 391, n_neg = 6672
after: n_pos= 391, n_neg = 782
'''

### 3、Condensed Nearest Neighbor（CNN）

1. S中随机选择一个样本p，并加入到U中：U={p}
2. S-U中扫描样本，将第一个满足条件（其在U中最近的样本与它自己具有不同类别）的样本p加入到U
3. 持续运行第二步，直到U不再增加
from sklearn.neighbors import DistanceMetric

def condensed_nn(X, y):
    """Condensed Nearest Neighbor (CNN) undersampling.

    Starts with U = {sample 0}; repeatedly scans the remaining samples and
    moves the first one whose nearest neighbour in U has a different label
    into U, until U stops growing.  All positives are then added back so
    the minority class is never reduced.

    Fixes: the inner loop reused (shadowed) the outer loop variable `i`;
    `np.array(n_U)` wrapped an already-constructed array; the deprecated
    `sklearn.neighbors.DistanceMetric` is replaced by scipy's cdist.
    """
    from scipy.spatial.distance import cdist

    remaining = [i for i in range(1, X.shape[0])]  # S \ U, as indices into X
    kept = [0]                                     # U, seeded with sample 0

    # At most one sample is absorbed per pass, so n passes suffice.
    for _ in range(X.shape[0]):
        prev_len = len(kept)

        rem_idx = np.array(remaining)
        kept_idx = np.array(kept)
        d = cdist(X[rem_idx], X[kept_idx])

        for row in range(d.shape[0]):
            nearest_kept = kept_idx[d[row].argsort()[0]]
            cand = rem_idx[row]
            # Misclassified by 1-NN over U -> absorb into U and rescan.
            if y[nearest_kept] != y[cand]:
                kept.append(cand)
                remaining.remove(cand)
                break
        if prev_len == len(kept):  # no addition in a full pass: converged
            break

    # Union with every positive sample so the minority class stays intact.
    all_idx = np.array(list(set(kept + [i for i in range(y.shape[0]) if y[i] == 1])))
    n_X, n_y = X[all_idx], y[all_idx]
    print('before: n_pos= %d, n_neg = %d' % (np.sum(y == 1), np.sum(y == 0)))
    print('after: n_pos= %d, n_neg = %d' % (np.sum(n_y == 1), np.sum(n_y == 0)))
    return n_X, n_y

# Condensed-NN undersampling of the training set.
n_trn_X, n_trn_y = condensed_nn(trn_X, trn_y)
'''
before: n_pos= 391, n_neg = 6672
after: n_pos= 391, n_neg = 557
'''

### 4、Edited Nearest Neighbor（ENN）

def edited_nn(X, y, k, isthrd=True, islog=True):
    """Edited Nearest Neighbours (ENN) cleaning of the majority class.

    Each negative (y == 0) sample inspects its k nearest neighbours in the
    full set (the k+1 window includes the sample itself, at distance 0)
    and is kept only when fewer than `thrd` of them are positive.

    isthrd=True uses a majority threshold of k/2; otherwise a single
    positive neighbour suffices to discard the sample.
    islog is accepted for interface compatibility but currently unused.

    Fixes: the keep-mask was computed twice; the deprecated
    `sklearn.neighbors.DistanceMetric` is replaced by scipy's cdist.
    """
    from scipy.spatial.distance import cdist

    thrd = 1 if not isthrd else k / 2
    neg = X[y == 0]
    n_d = cdist(neg, X)
    # k+1 nearest indices per negative (slot 0 is the sample itself).
    top_k_idx = n_d.argsort()[:, :k + 1]
    top_k_label = np.array([y[top_k_idx[i]] for i in range(top_k_idx.shape[0])])
    keep = np.sum(top_k_label, axis=1) < thrd
    n_X = np.vstack((neg[keep], X[y == 1]))
    n_y = np.array([0] * np.sum(keep) + [1] * np.sum(y == 1))

    print('before: n_pos= %d, n_neg = %d' % (np.sum(y == 1), np.sum(y == 0)))
    print('after: n_pos= %d, n_neg = %d' % (np.sum(n_y == 1), np.sum(n_y == 0)))
    return n_X, n_y

# ENN cleaning with a 15-neighbour window.
n_trn_X, n_trn_y = edited_nn(trn_X, trn_y, k=15)
'''
before: n_pos= 391, n_neg = 6672
after: n_pos= 391, n_neg = 5850
'''

def tomek_link_rm(X, y):
    """Drop the majority-class member of every (sample, nearest-neighbour)
    pair whose labels differ — an approximation of Tomek-link removal.

    Fixes vs. original:
      * returned the globals `n_trn_X, n_trn_y` instead of the local
        result arrays `n_X, n_y`;
      * crashed when no such pair existed (empty fancy index);
      * `rm_pos_idx` renamed — it collects NEGATIVE (y == 0) samples;
      * deprecated `sklearn.neighbors.DistanceMetric` replaced by cdist.
    """
    from scipy.spatial.distance import cdist

    n_d = cdist(X, X)
    # Two smallest per row: the sample itself (distance 0) and its
    # nearest neighbour.
    top_k_idx = n_d.argsort()[:, :2]

    # Candidate links: pairs whose two labels differ (label sum == 1).
    tomek_link_idx = [[top_k_idx[i][0], top_k_idx[i][1]]
                      for i in range(top_k_idx.shape[0])
                      if np.sum(y[top_k_idx[i]]) == 1]

    # Negative-class members of any link get removed.
    rm_neg_idx = [idx for idx in set([pair[0] for pair in tomek_link_idx]
                                     + [pair[1] for pair in tomek_link_idx])
                  if y[idx] == 0]
    retained_flag = np.array([1] * X.shape[0])
    if rm_neg_idx:  # guard: empty fancy index would raise
        retained_flag[np.array(rm_neg_idx)] = 0

    n_X = X[retained_flag == 1]
    n_y = y[retained_flag == 1]

    print('before: n_pos= %d, n_neg = %d' % (np.sum(y == 1), np.sum(y == 0)))
    print('after: n_pos= %d, n_neg = %d' % (np.sum(n_y == 1), np.sum(n_y == 0)))
    return n_X, n_y

'''
tomek_link pairs = 145, removed pos samples = 116
before: n_pos= 391, n_neg = 6672
after: n_pos= 391, n_neg = 6556
'''

## 三、少数类样本上采样

### 1、随机上采样

def random_oversampling(X, y, balanced_ratio=0.5):
    """Duplicate random minority (y == 1) samples.

    balanced_ratio = n_pos/n_neg targeted after sampling.  The input is
    returned unchanged when the data is already at least that balanced.
    """
    current_ratio = np.sum(y == 1) / np.sum(y == 0)
    if current_ratio > balanced_ratio:
        return X, y

    # Extra positive copies needed to reach the target ratio.
    n_extra = int(np.sum(y == 0) * balanced_ratio - np.sum(y == 1))
    pos_idx = np.flatnonzero(y == 1)
    drawn = np.random.choice(pos_idx, n_extra)
    n_X = np.vstack((X, X[drawn]))
    n_y = np.hstack((y, y[drawn]))

    print('before: n_pos = %d, n_neg = %d' % (np.sum(y == 1), np.sum(y == 0)))
    print('after: n_pos = %d, n_neg = %d' % (np.sum(n_y == 1), np.sum(n_y == 0)))
    return n_X, n_y

# Random oversampling of training positives to n_pos/n_neg = 0.5.
n_trn_X, n_trn_y = random_oversampling(X=trn_X, y=trn_y, balanced_ratio=0.5)
'''
before: n_pos = 391, n_neg = 6672
after: n_pos = 3336, n_neg = 6672
'''

### 2、SMOTE

SMOTE（Synthetic Minority Oversampling Technique）的基本思想是对少数类样本进行分析并生成人工合成新样本，具体如下图所示：

SMOTE步骤如下：

1. 对每一个少数类样本 $x$，计算其在少数类中的 $k$ 个最近邻样本
2. 随机从这 $k$ 个近邻中选出一个样本 $\tilde{x}$，并在 $x$ 和 $\tilde{x}$ 的连线上随机生成新样本
3. 重复步骤2，直到生成满足数量的人工样本

from imblearn.over_sampling import SMOTE

def smote(X, y, balanced_ratio=0.5):
    """SMOTE-oversample the minority class to n_pos = n_neg * balanced_ratio.

    Fix: the original hard-coded 0.5 and silently ignored the
    balanced_ratio argument (callers passing 0.25 still got a 0.5 ratio).
    """
    n_pos = int(np.sum(y == 0) * balanced_ratio)
    model = SMOTE(sampling_strategy={1: n_pos})
    n_X, n_y = model.fit_resample(X, y)
    print('before: n_pos = %d, n_neg = %d' % (np.sum(y == 1), np.sum(y == 0)))
    print('after: n_pos = %d, n_neg = %d' % (np.sum(n_y == 1), np.sum(n_y == 0)))
    return n_X, n_y

# SMOTE oversampling of the training positives.
n_trn_X, n_trn_y = smote(trn_X, trn_y, balanced_ratio=0.25)
'''
before: n_pos = 391, n_neg = 6672
after: n_pos = 3336, n_neg = 6672
'''

### 3、Borderline-SMOTE

Borderline SMOTE是在SMOTE基础上改进的少数类样本上采样算法。不同于SMOTE算法，Borderline SMOTE仅使用边界上的少数类样本来生成新的样本，从而改善样本的类别分布（注：另有一种借助SVM来确定边界样本的变体，称为SVM-SMOTE）。 Borderline SMOTE在采样过程中将少数类样本分为3类：NOISE、SAFE和DANGER。

1. 对每个少数类样本 $x$，从所有样本集合中计算其 $k$ 个最近邻样本，记为 $S_k(x)$
2. 统计 $S_k(x)$ 中属于多数类的样本个数，记为 $m'$
3. 按 $m'$ 判断样本 $x$ 的类型：
4. 如果 $m' = k$（近邻全部为多数类），则将 $x$ 记为NOISE
5. 如果 $0 \le m' < k/2$，则将 $x$ 记为SAFE
6. 如果 $k/2 \le m' < k$，则将 $x$ 记为DANGER

from imblearn.over_sampling import BorderlineSMOTE

def borderline_smote(X, y, balanced_ratio=0.5):
    """Borderline-SMOTE oversampling to n_pos = n_neg * balanced_ratio.

    Fix: the original hard-coded 0.5 and silently ignored the
    balanced_ratio argument (callers passing 0.25 still got a 0.5 ratio).
    """
    n_pos = int(np.sum(y == 0) * balanced_ratio)
    model = BorderlineSMOTE(sampling_strategy={1: n_pos})
    n_X, n_y = model.fit_resample(X, y)
    print('before: n_pos = %d, n_neg = %d' % (np.sum(y == 1), np.sum(y == 0)))
    print('after: n_pos = %d, n_neg = %d' % (np.sum(n_y == 1), np.sum(n_y == 0)))
    return n_X, n_y

# Borderline-SMOTE oversampling of the training positives.
n_trn_X, n_trn_y = borderline_smote(trn_X, trn_y, balanced_ratio=0.25)
'''
before: n_pos = 391, n_neg = 6672
after: n_pos = 3336, n_neg = 6672
'''

## 四、下采样 & 上采样

### 1、SMOTE + Tomek link removal

def combine_smote_tomek(X, y, balanced_ratio):
    """Borderline-SMOTE oversampling followed by Tomek-link cleaning.

    Fix: the original passed the still-undefined locals `n_X, n_y` into
    borderline_smote (a NameError at runtime) and never invoked the
    Tomek-link removal step its name promises.
    """
    # Oversample the minority class with Borderline-SMOTE ...
    n_X, n_y = borderline_smote(X, y, balanced_ratio=balanced_ratio)
    # ... then clean boundary noise by removing Tomek links.
    n_X, n_y = tomek_link_rm(n_X, n_y)
    print('before: n_pos = %d, n_neg = %d' % (np.sum(y == 1), np.sum(y == 0)))
    print('after: n_pos = %d, n_neg = %d' % (np.sum(n_y == 1), np.sum(n_y == 0)))
    return n_X, n_y

# Combined resampling: Borderline-SMOTE + Tomek-link removal.
n_trn_X, n_trn_y = combine_smote_tomek(trn_X, trn_y, balanced_ratio=0.25)
'''
before: n_pos= 391, n_neg = 6672
after: n_pos = 3336, n_neg = 6672
'''

### 2、SMOTE + ENN

def combine_enn(X, y, k, balanced_ratio):
    """ENN cleaning followed by Borderline-SMOTE oversampling."""
    cleaned_X, cleaned_y = edited_nn(X, y, k)
    # Oversample the minority class on the cleaned data.
    res_X, res_y = borderline_smote(cleaned_X, cleaned_y, balanced_ratio=balanced_ratio)
    print('before: n_pos = %d, n_neg = %d' % (np.sum(y == 1), np.sum(y == 0)))
    print('after: n_pos = %d, n_neg = %d' % (np.sum(res_y == 1), np.sum(res_y == 0)))
    return res_X, res_y

# Combined resampling: ENN cleaning (k = 7) + Borderline-SMOTE.
n_trn_X, n_trn_y = combine_enn(trn_X, trn_y, k=7, balanced_ratio=0.25)
'''
before: n_pos= 391, n_neg = 6672
after: n_pos = 3143, n_neg = 6287
'''

## 五、集成方法

### 1、EasyEnsemble

EasyEnsemble算法是通过对多数类样本进行多次重采样，构造多组平衡数据集，来训练若干个分类器进行集成学习。算法的具体步骤如下：

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
def EasyEnsemble(trn_X, trn_y, tst_X, tst_y, clf, n_iter):
    """EasyEnsemble: average n_iter classifiers trained on balanced resamples.

    Each round draws a random subset of negatives the same size as the
    positive set, trains a fresh `clf()` on the balanced data, and the
    final score is the mean of the n_iter test predictions.

    Parameters
    ----------
    clf    : zero-argument classifier factory (e.g. LogisticRegression).
    n_iter : number of balanced sub-models to train and average.

    Returns the averaged test predictions.
    Fixes: negatives are now sampled WITHOUT replacement (a subset, per
    the EasyEnsemble formulation) instead of with replacement; the unused
    `trn_pred` buffer was removed; output typo 'Precison on Majarity'
    corrected; predictions are returned for further use.
    """
    num_pos = np.sum(trn_y)
    trn_pos = trn_X[trn_y == 1]
    trn_neg = trn_X[trn_y == 0]
    neg_idx = list(range(trn_neg.shape[0]))

    tst_pred = np.empty((tst_X.shape[0], n_iter))

    for i in range(n_iter):
        classifier = clf()
        # Sample as many negatives as there are positives, without replacement.
        sampled_neg_idx = np.random.choice(neg_idx, num_pos, replace=False)
        neg_sample = trn_neg[sampled_neg_idx]
        x_sub = np.vstack((neg_sample, trn_pos))
        y_sub = np.array([0] * neg_sample.shape[0] + [1] * trn_pos.shape[0])

        # Fit the classifier to the balanced dataset.
        classifier.fit(x_sub, y_sub)
        tst_pred[:, i] = classifier.predict_proba(tst_X)[:, 1]

    # Average all the test predictions.
    ensemble_pred = np.mean(tst_pred, axis=1)
    p_r, n_p = evaluate(tst_y, ensemble_pred)
    print('[%s] Precision on Majority: %.3f, Recall on Minority: %.3f' % (clf.__name__, n_p, p_r))
    return ensemble_pred

# Run EasyEnsemble with three different base learners (20 sub-models each).
EasyEnsemble(trn_X, trn_y, tst_X, tst_y, clf=LogisticRegression, n_iter=20)
EasyEnsemble(trn_X, trn_y, tst_X, tst_y, clf=GradientBoostingClassifier, n_iter=20)
EasyEnsemble(trn_X, trn_y, tst_X, tst_y, clf=RandomForestClassifier, n_iter=20)

def BalanceCascade(trn_X, trn_y, tst_X, tst_y, clf, n_iter):
    """BalanceCascade ensemble for imbalanced classification.

    Trains n_iter classifiers; after each round the 'easy' negatives
    (those the current model scores below a per-round threshold) are
    dropped, so later rounds focus on harder majority samples.

    clf is a zero-argument classifier factory; n_iter the cascade depth.
    Prints test precision/recall via the module-level `evaluate`.
    """
    neg_num = np.sum(trn_y == 0)
    pos_num = np.sum(trn_y == 1)
    neg_idx = np.argwhere(trn_y == 0).reshape(neg_num, )
    pos_idx = np.argwhere(trn_y == 1).reshape(pos_num, )
    pos_trn = trn_X[pos_idx]

    # Per-round keep ratio: after n_iter-1 shrinkages the negative pool
    # is roughly the size of the positive set.
    FP = pow(pos_num/neg_num, 1/(n_iter-1))
    classifiers = {}
    thresholds = {}
    tst_prob = np.empty((tst_X.shape[0], n_iter))
    for i in range(n_iter):
        classifiers[i] = clf()
        # Balanced round: all positives + an equally sized random negative draw.
        neg_trn_idx = np.random.permutation(neg_idx)[:pos_num]
        neg_trn = trn_X[neg_trn_idx]
        sub_X = np.vstack((pos_trn, neg_trn))
        sub_y = np.array([1]*pos_trn.shape[0] + [0]*neg_trn.shape[0])
        classifiers[i].fit(sub_X, sub_y)
        # Score every negative still in the pool with this round's model.
        trn_pred = classifiers[i].predict_proba(trn_X[neg_idx])[:,1]

        # Threshold at the (1-FP)-quantile; only negatives scored above it
        # ("hard" ones) survive to the next round.
        thresholds[i] = np.sort(trn_pred)[int(neg_idx.shape[0]*(1-FP))]
        # NOTE(review): argwhere returns positions WITHIN trn_pred (i.e.
        # within the previous neg_idx), not row indices of trn_X — this
        # looks like it should be neg_idx[...]; verify against the paper.
        neg_idx = np.argwhere(trn_pred >= thresholds[i]).reshape(-1, )
        # NOTE(review): the round threshold is ADDED to the test
        # probabilities — presumably a per-round calibration offset;
        # confirm this is intentional.
        tst_prob[:,i] = classifiers[i].predict_proba(tst_X)[:,1] + thresholds[i]

    ensemble_pred = np.average(tst_prob, axis=1)
    p_r, n_p = evaluate(tst_y, ensemble_pred)
    print('[%s] Precison on Majarity: %.3f, Recall on Minority: %.3f'%(clf.__name__, n_p, p_r))

# Run BalanceCascade with two different base learners (20 rounds each).
BalanceCascade(trn_X, trn_y, tst_X, tst_y, clf=LogisticRegression, n_iter=20)
BalanceCascade(trn_X, trn_y, tst_X, tst_y, clf=RandomForestClassifier, n_iter=20)

## 总结

1. LR在对于不平衡数据的处理效果并不好：倾向于将大多数样本判别为负样本，从而导致少数类样本的召回率只有16.4%
2. 总的来看，Ensemble方法的效果在处理不平衡问题生效果最好。但集成方法需要重复训练多个子分类器，效率相对较低；
3. 对样本进行加权后的LR相比于原始LR在正样本召回和负样本准确率上均有显著提升；
4. 在下采样方法中，NearMiss效果要优于其他下采样方法；
5. 上采样方法中，SMOTE和Borderline-SMOTE要明显好于随机上采样。

## 参考文献

[1]. sklearn: scikit-learn.org/stable

[2]. imblearn: imbalanced-learn.org/

[3]. Adaptive Weight Optimization for Classification of Imbalanced Data

[4]. Inderjeet Mani and I Zhang. knn approach to unbalanced data distributions: a case study involving information extraction. In Proceedings of workshop on learning from imbalanced datasets, 2003.

[5]. P. Hart. The condensed nearest neighbor rule. IEEE Transactions on Information Theory, 14(3):515–516, September 2006.

[6]. Dennis L Wilson. Asymptotic properties of nearest neighbor rules using edited data. IEEE Transactions on Systems, Man, and Cybernetics, (3):408–421, 1972.

[7]. Ivan Tomek. Two modifications of cnn. IEEE Transactions on Systems, Man and Cybernetics, 6:769–772, 1976.

[8]. Nitesh V. Chawla, Kevin W. Bowyer, Lawrence O. Hall, and W. Philip Kegelmeyer. Smote: synthetic minority over-sampling technique. Journal of artificial intelligence research, 16:321–357, 2002.

[9]. Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. Borderline-smote: a new over-sampling method in imbalanced data sets learning. In International Conference on Intelligent Computing, pages 878–887. Springer, 2005.

[10]. Xu-Ying Liu, Jianxin Wu, and Zhi-Hua Zhou. Exploratory undersampling for class-imbalance learning. IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), 39(2):539–550, 2009.