Added python script that applies oversampling before partitioning on RANDOM data

GillesVandewiele · web-flow · commit f4250ad5a3dd · 2019-06-26T16:59:20.000+02:00
diff --git a/smote_random_data.py b/smote_random_data.py
@@ -0,0 +1,35 @@
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from imblearn.over_sampling import SMOTE
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_auc_score
+
+np.random.seed(42)
+
+# Synthetic data: 5 uniform features; labels (~10% positive) are drawn independently of X
+X = np.random.random_sample((10000, 5))
+y = np.random.choice([0, 1], size=10000, p=[0.9, 0.1])
+
+# Baseline: ROC AUC on the held-out test split, with no oversampling at all
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+baseline_rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
+# Column 1 of predict_proba holds the predicted probability of class 1
+baseline_auc = roc_auc_score(y_test, baseline_rf.predict_proba(X_test)[:, 1])
+print('AUC no oversampling: {}'.format(baseline_auc))
+
+# Correct order: split first, then oversample ONLY the training split; AUC is measured on the untouched test split.
+# fit_resample replaces the deprecated fit_sample alias, which newer imbalanced-learn releases no longer provide.
+smote = SMOTE()
+X_train_s, y_train_s = smote.fit_resample(X_train, y_train)
+rf = RandomForestClassifier(n_estimators=100).fit(X_train_s, y_train_s)
+preds = rf.predict_proba(X_test)[:, 1]
+print('AUC with oversampling after partitioning: {}'.format(roc_auc_score(y_test, preds)))
+
+# Deliberately flawed order: run SMOTE on the FULL data set, then split. The test split now holds synthetic
+# points and neighbours of training points, so information leaks across the split and can inflate the AUC.
+smote = SMOTE()
+X_s, y_s = smote.fit_resample(X, y)  # fit_resample: current name of the deprecated fit_sample
+X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, random_state=42)
+rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
+preds = rf.predict_proba(X_test)[:, 1]
+print('AUC with oversampling before partitioning: {}'.format(roc_auc_score(y_test, preds)))