# devtools::install_github("MrDomani/autofeat")
library(autofeat)
library(mlr)
#> Loading required package: ParamHelpers
#> 'mlr' is in maintenance mode since July 2019. Future development
#> efforts will go into its successor 'mlr3' (<https://mlr3.mlr-org.com>).

# Load the imputed Titanic data set shipped with the DALEX package.
data("titanic_imputed", package = "DALEX")

# Fix the RNG seed so the random train/test split below is reproducible.
# The original script set no seed before sampling, so the recorded `#>`
# outputs further down came from an unseeded run; exact numbers will differ.
set.seed(123)

# Predictor columns, defined once so the train and test matrices are always
# built from the same set of features.
feature_cols <- c("age", "fare", "sibsp", "parch")

# Build the label factor once and subset it afterwards: subsetting a factor
# keeps its full level set, so train and test labels share identical levels
# even if one class happened to be missing from one of the partitions.
survived_all <- factor(titanic_imputed$survived)

# Sample 70% of the row indices (without replacement) for training.
# seq_len() is used rather than 1:nrow(), which would yield c(1, 0) for an
# empty data frame.
n_obs <- nrow(titanic_imputed)
i <- sample(seq_len(n_obs), size = round(0.7 * n_obs))

X_train <- data.matrix(titanic_imputed[i, feature_cols])
y_train <- survived_all[i]

# Negative indexing selects every row that was not drawn for training.
X_test <- data.matrix(titanic_imputed[-i, feature_cols])
y_test <- survived_all[-i]

# Run autofeat's SAFE on the training matrix and labels, additionally passing
# the held-out matrix and labels as the third and fourth arguments, with
# n_iter = 5.
# NOTE(review): SAFE's signature is not visible here. Presumably the third and
# fourth arguments are validation data (the result exposes `X_valid`) - confirm.
# NOTE(review): if SAFE uses `y_test` when selecting features, the test-set
# metrics computed later are optimistically biased - verify against autofeat.
X_SAFEd <- SAFE(X_train, y_train, X_test, y_test, n_iter = 5)
#> 
#> 1th iteration out of 5 complete
#> 2th iteration out of 5 complete
#> 3th iteration out of 5 complete
#> 4th iteration out of 5 complete
#> 5th iteration out of 5 complete
# Extract the transformed feature sets from the SAFE result.
X_train_SAFEd <- X_SAFEd$X_train
# The result element is called `X_valid`; it is used as the test set from here on.
X_test_SAFEd <- X_SAFEd$X_valid

# Training tasks: one on the raw features, one on the SAFE-transformed
# features. Each task's data frame is the feature set plus a `target` column
# holding the labels.
# NOTE(review): no `positive` class is passed to makeClassifTask(); per the mlr
# documentation the default is the first factor level of the target - confirm
# that this is the class `ppv` is meant to be reported for.
task1 <- makeClassifTask(
  id = "original_train",
  data = data.frame(X_train, target = y_train),
  target = "target"
)
task2 <- makeClassifTask(
  id = "SAFE_train",
  data = data.frame(X_train_SAFEd, target = y_train),
  target = "target"
)

# Matching hold-out tasks, built the same way from the test partition.
task1_test <- makeClassifTask(
  id = "original_test",
  data = data.frame(X_test, target = y_test),
  target = "target"
)
task2_test <- makeClassifTask(
  id = "SAFE_test",
  data = data.frame(X_test_SAFEd, target = y_test),
  target = "target"
)

# ranger learner configured to return class probabilities (predict.type = "prob").
lrn <- makeLearner(cl = "classif.ranger", predict.type = "prob")

# Cross-validate the same learner on both training tasks and print the
# aggregated AUC and PPV for each. The recorded runs below show 10 resampling
# iterations per call.
cv_measures <- list(auc, ppv)

# Raw features.
cv_original <- crossval(learner = lrn, task = task1, measures = cv_measures)
cv_original$aggr
#> Resampling: cross-validation
#> Measures:             auc       ppv
#> [Resample] iter 1:    0.6858032 0.7313433
#> [Resample] iter 2:    0.7536364 0.7894737
#> [Resample] iter 3:    0.6273585 0.7037037
#> [Resample] iter 4:    0.6268222 0.7642276
#> [Resample] iter 5:    0.6339423 0.7111111
#> [Resample] iter 6:    0.6531120 0.7600000
#> [Resample] iter 7:    0.7471429 0.7680000
#> [Resample] iter 8:    0.6293333 0.7218045
#> [Resample] iter 9:    0.6657185 0.7841727
#> [Resample] iter 10:   0.6715335 0.7111111
#> 
#> Aggregated Result: auc.test.mean=0.6694403,ppv.test.mean=0.7444948
#> 
#> auc.test.mean ppv.test.mean 
#>     0.6694403     0.7444948

# SAFE-transformed features.
cv_safe <- crossval(learner = lrn, task = task2, measures = cv_measures)
cv_safe$aggr
#> Resampling: cross-validation
#> Measures:             auc       ppv
#> [Resample] iter 1:    0.6813131 0.7899160
#> [Resample] iter 2:    0.7561275 0.7661290
#> [Resample] iter 3:    0.6760244 0.7500000
#> [Resample] iter 4:    0.6295782 0.6984127
#> [Resample] iter 5:    0.6667585 0.6818182
#> [Resample] iter 6:    0.6019656 0.7500000
#> [Resample] iter 7:    0.6210672 0.7521368
#> [Resample] iter 8:    0.7386783 0.7600000
#> [Resample] iter 9:    0.6503889 0.6953125
#> [Resample] iter 10:   0.5878899 0.7230769
#> 
#> Aggregated Result: auc.test.mean=0.6609792,ppv.test.mean=0.7366802
#> 
#> auc.test.mean ppv.test.mean 
#>     0.6609792     0.7366802

# Fit one model per feature set on the complete training tasks.
ranger1 <- train(learner = lrn, task = task1)
ranger2 <- train(learner = lrn, task = task2)

# AUC on the data the models were trained on (in-sample, so expected to be
# higher than the hold-out figures below).
pred_train_original <- predict(ranger1, task = task1)
performance(pred_train_original, measures = auc)
#>       auc 
#> 0.8685697
pred_train_safe <- predict(ranger2, task = task2)
performance(pred_train_safe, measures = auc)
#>       auc 
#> 0.9096177

# AUC on the held-out test tasks.
pred_test_original <- predict(ranger1, task = task1_test)
performance(pred_test_original, measures = auc)
#>      auc 
#> 0.686658
pred_test_safe <- predict(ranger2, task = task2_test)
performance(pred_test_safe, measures = auc)
#>       auc 
#> 0.6770653