1
+ import numpy as np
2
+ from sklearn .ensemble import RandomForestClassifier
3
+ from imblearn .over_sampling import SMOTE
4
+ from sklearn .model_selection import train_test_split
5
+ from sklearn .metrics import roc_auc_score
6
+
7
# Seed NumPy's global RNG so the synthetic data below are reproducible.
np.random.seed(42)

# Shape and class balance of the synthetic dataset, named once so the
# feature matrix and the label vector cannot drift out of agreement.
N_SAMPLES = 10000
N_FEATURES = 5
CLASS_PROBS = [0.9, 0.1]  # P(label == 0), P(label == 1)

# Features are uniform in [0, 1); labels are drawn independently of the
# features, so the data contain no real signal for a classifier to learn.
X = np.random.rand(N_SAMPLES, N_FEATURES)
y = np.random.choice([0, 1], size=(N_SAMPLES,), p=CLASS_PROBS)
12
+
13
# Baseline: hold out a test split, train a random forest on the untouched
# (imbalanced) training split, and report ROC AUC on the held-out data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
# Keep only column 1 of predict_proba: the score for the positive class (label 1).
positive_scores = rf.predict_proba(X_test)
preds = positive_scores[:, 1]
baseline_auc = roc_auc_score(y_test, preds)
print('AUC no oversampling: {}'.format(baseline_auc))
19
+
20
# Oversample only the training split with SMOTE, then report ROC AUC on the
# original, un-resampled test split.
smote = SMOTE()
# fit_resample is the current imbalanced-learn API; the older fit_sample
# alias was deprecated and later removed.
X_train_s, y_train_s = smote.fit_resample(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_s, y_train_s)
# Column 1 of predict_proba is the score for the positive class (label 1).
preds = rf.predict_proba(X_test)[:, 1]
print('AUC with oversampling after partitioning: {}'.format(roc_auc_score(y_test, preds)))
27
+
28
# Apply SMOTE to the whole dataset first, then split and report ROC AUC.
# NOTE: here the test split is drawn from the already-resampled data, so it
# includes SMOTE-generated samples; this ordering is what the script compares
# against the "after partitioning" variant.
smote = SMOTE()
# fit_resample is the current imbalanced-learn API; the older fit_sample
# alias was deprecated and later removed.
X_s, y_s = smote.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, random_state=42)
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
# Column 1 of predict_proba is the score for the positive class (label 1).
preds = rf.predict_proba(X_test)[:, 1]
print('AUC with oversampling before partitioning: {}'.format(roc_auc_score(y_test, preds)))
0 commit comments