Skip to content

Commit f4250ad

Browse files
Added python script that applies oversampling before partitioning on RANDOM data
1 parent 2dc84c7 commit f4250ad

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed

smote_random_data.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
"""Show how applying SMOTE before the train/test split distorts the test AUC.

Features and labels are drawn independently at random, so there is no real
signal for a classifier to learn. Three pipelines are scored on a held-out
split:

1. no oversampling;
2. SMOTE applied to the training split only;
3. SMOTE applied to the full data set *before* splitting.

Pipeline 3 is deliberately wrong: the test split then contains synthetic
minority samples generated from the same rows the model trains on, so its
AUC is typically over-optimistic compared with pipelines 1 and 2.
"""
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


def _test_auc(X_train, y_train, X_test, y_test):
    """Fit a 100-tree random forest and return its ROC AUC on the test split."""
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X_train, y_train)
    # Labels are 0/1, so column 1 of predict_proba is the score for class 1.
    preds = rf.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, preds)


# Seed NumPy's global RNG; the estimators below are created without an
# explicit random_state, so they draw from this global state as well.
np.random.seed(42)

# Generate random data: 5 uniform features and ~10% positive labels that are
# independent of the features.
X = np.random.rand(10000, 5)
y = np.random.choice([0, 1], size=(10000, ), p=[0.9, 0.1])

# Baseline: measure AUC on the test set with no oversampling
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print('AUC no oversampling: {}'.format(
    _test_auc(X_train, y_train, X_test, y_test)))

# Apply oversampling to the train set only and measure AUC on the untouched
# test set. fit_resample replaces the legacy fit_sample method, which newer
# imbalanced-learn releases no longer provide.
smote = SMOTE()
X_train_s, y_train_s = smote.fit_resample(X_train, y_train)
print('AUC with oversampling after partitioning: {}'.format(
    _test_auc(X_train_s, y_train_s, X_test, y_test)))

# Now first apply SMOTE to the whole data set, then partition and measure AUC.
# NOTE: this ordering is the intentional mistake the script demonstrates.
smote = SMOTE()
X_s, y_s = smote.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, random_state=42)
print('AUC with oversampling before partitioning: {}'.format(
    _test_auc(X_train, y_train, X_test, y_test)))

0 commit comments

Comments
 (0)