Using XGBoost to Predict Elo Customer Loyalty
1. Data Pre-Processing
1.1 Load the data
## check data
library(data.table)  # provides fread() and the data.table syntax used below
# load all input tables
train <- fread('data/train.csv')
test <- fread('data/test.csv')
sample_submission <- fread('data/sample_submission.csv')
history <- fread('data/historical_transactions.csv')
new <- fread('data/new_merchant_transactions.csv')
merchant <- fread('data/merchants.csv')
1.2 Examining the data and feature engineering
After a quick examination of the data tables, we listed the features we think are important. Below we describe how we plan to engineer each of these features, along with the code.
1.2.1 The train table
- card age, which can be calculated from first_active_month
- feature_1, feature_2, and feature_3. We don't know what these represent, since the data is anonymized, but each of these categorical variables has only a few levels, so we can simply transform them into factors.
Code for processing the features from train:
# keep the target separately, keyed by card_id
target <- train[, .(card_id, target)]
train$target <- NULL
# save ids for train and test
id_tr <- train$card_id
id_te <- test$card_id
# bind train and test
data <- rbind(train, test)
# names for the dummy columns of each anonymized feature
ind_1 <- paste0('feature_1_', 1:5)
ind_2 <- paste0('feature_2_', 1:3)
ind_3 <- paste0('feature_3_', 0:1)
# card age in years, measured from mid-month of first_active_month to 2018-02-15
data[, `:=`(years_open = as.numeric(as.Date('2018-02-15') - as.Date( paste0(first_active_month,'-15')) )/365,
first_active_month = NULL,
feature_1 = factor(feature_1, levels = 1:5, labels = ind_1),
feature_2 = factor(feature_2, levels = 1:3, labels = ind_2),
feature_3 = factor(feature_3, levels = 0:1, labels = ind_3)
)]
# dummy-code each factor, dropping the first level as the reference
data[, (ind_1[-1]) := lapply(ind_1[-1], function(x) as.numeric(feature_1 == x))]
data[, (ind_2[-1]) := lapply(ind_2[-1], function(x) as.numeric(feature_2 == x))]
data[, (ind_3[-1]) := lapply(ind_3[-1], function(x) as.numeric(feature_3 == x))]
# drop the original factor columns
data[, `:=` (feature_1 = NULL, feature_2 = NULL, feature_3 = NULL)]
rm(train, test); invisible(gc())
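To illustrate the dummy coding in the block above, here is a toy example (made-up values, not part of the pipeline; it reuses the ind_3 label vector defined above):
# toy data: level feature_3_0 is the reference level and gets no dummy column
toy <- data.table(feature_3 = factor(c(0, 1, 0), levels = 0:1, labels = ind_3))
toy[, (ind_3[-1]) := lapply(ind_3[-1], function(x) as.numeric(feature_3 == x))]
# toy now has a feature_3_1 column equal to c(0, 1, 0)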
1.2.2 The history table
- the frequency of transactions being denied (both a count and a ratio), which can be calculated from the authorized_flag column
- category_1, category_2, and category_3: these anonymized variables will be transformed into factors, and the count of each factor level will be tallied for each card. For each card, we can also look at the distribution across these levels.
- card usage over time: we can calculate a mean and a variance of the card usage over time.
- card debt situation: we can calculate the number of installments on the card and how that changes over time; another index is the purchase amount.
- the categories of purchases: customers might be more loyal to certain categories of purchase (e.g., a travel card dedicated to travel). merchant_category_id and subsector_id can be used to identify the spread and the mode of the merchant category for each card (a sketch of the mode computation follows the summary code below).
The historical_transactions.csv data is more complex. Let's build a per-card summary first.
sum_hist <- history[!(purchase_amount > 100 & authorized_flag == 'N'),
.(n_trans = .N, # number of transactions
n_denied = sum(authorized_flag == 'N', na.rm = T), # number of denied transactions
ratio_denied = sum(authorized_flag == 'N', na.rm = T) / .N, # the ratio of denied transactions
cat_1_N = sum(category_1 == 'N', na.rm = T), # category 1 level counts
cat_1_Y = sum(category_1 == 'Y', na.rm = T),
cat_2_1 = sum(category_2 == 1, na.rm = T), # category 2 level counts
cat_2_2 = sum(category_2 == 2, na.rm = T),
cat_2_3 = sum(category_2 == 3, na.rm = T),
cat_2_4 = sum(category_2 == 4, na.rm = T),
cat_2_5 = sum(category_2 == 5, na.rm = T),
cat_3_A = sum(category_3 == 'A', na.rm = T), # category 3 level counts
cat_3_B = sum(category_3 == 'B', na.rm = T),
cat_3_C = sum(category_3 == 'C', na.rm = T),
spend = sum(purchase_amount[authorized_flag == 'Y'], na.rm = T),
n_installment_outlier = sum(installments < 0 | installments > 99, na.rm = T),
mean_installment = mean(installments[installments >= 0 & installments < 99], na.rm = T),
n_merchant = uniqueN(merchant_category_id),
n_subsector = uniqueN(paste0(merchant_category_id, subsector_id)),
n_city = uniqueN(paste0(state_id, city_id))
),
by = card_id]
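The mode of merchant_category_id mentioned in the feature list is not included in sum_hist; here is a minimal sketch of how it could be computed (hypothetical column name top_merchant_cat, assuming history is still loaded):
# most frequent merchant_category_id per card (hypothetical feature, not used below)
mode_hist <- history[, .N, by = .(card_id, merchant_category_id)][
  order(card_id, -N)][, .(top_merchant_cat = merchant_category_id[1]), by = card_id]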
Next, get some statistics related to the trend of each card, computed per month (grouped by card_id and month_lag).
trend_hist <- history[!(purchase_amount > 100 & authorized_flag == 'N'),
.(n_trans = .N, # number of transactions
ratio_denied = sum(authorized_flag == 'N', na.rm = T) / .N, # the ratio of denied transactions
cat_1_N = sum(category_1 == 'N', na.rm = T), # category 1 level counts
cat_2_1 = sum(category_2 == 1, na.rm = T), # category 2 level counts
cat_2_2 = sum(category_2 == 2, na.rm = T),
cat_2_3 = sum(category_2 == 3, na.rm = T),
cat_2_4 = sum(category_2 == 4, na.rm = T),
cat_3_A = sum(category_3 == 'A', na.rm = T), # category 3 level counts
cat_3_B = sum(category_3 == 'B', na.rm = T),
spend = sum(purchase_amount[authorized_flag == 'Y'], na.rm = T),
mean_installment = mean(installments[installments >= 0 & installments < 99], na.rm = T),
n_merchant = uniqueN(merchant_category_id),
n_city = uniqueN(paste0(state_id, city_id))
),
by = .(card_id, month_lag)]
# trend_hist[is.na(mean_installment), mean_installment := (-1)]
Now, summarize the trend into a variance over time and a delta over time (the value at the most recent month_lag minus the value at the earliest month_lag) for each card.
rm(history); invisible(gc())
sum_trend_hist <- trend_hist[, .(
v_n_trans = var(n_trans),
d_n_trans = n_trans[which.max(month_lag)] -
n_trans[which.min(month_lag)],
v_ratio_denied = var(ratio_denied),
d_ratio_denied = ratio_denied[which.max(month_lag)] -
ratio_denied[which.min(month_lag)],
v_cat_1_N = var(cat_1_N),
d_cat_1_N = cat_1_N[which.max(month_lag)] -
cat_1_N[which.min(month_lag)],
v_cat_2_1 = var(cat_2_1),
d_cat_2_1 = cat_2_1[which.max(month_lag)] -
cat_2_1[which.min(month_lag)],
v_cat_2_2 = var(cat_2_2),
d_cat_2_2 = cat_2_2[which.max(month_lag)] -
cat_2_2[which.min(month_lag)],
v_cat_2_3 = var(cat_2_3),
d_cat_2_3 = cat_2_3[which.max(month_lag)] -
cat_2_3[which.min(month_lag)],
v_cat_2_4 = var(cat_2_4),
d_cat_2_4 = cat_2_4[which.max(month_lag)] -
cat_2_4[which.min(month_lag)],
v_cat_3_A = var(cat_3_A),
d_cat_3_A = cat_3_A[which.max(month_lag)] -
cat_3_A[which.min(month_lag)],
v_cat_3_B = var(cat_3_B),
d_cat_3_B = cat_3_B[which.max(month_lag)] -
cat_3_B[which.min(month_lag)],
v_spend = var(spend),
d_spend = spend[which.max(month_lag)] -
spend[which.min(month_lag)],
v_mean_installment = var(mean_installment),
d_mean_installment =
mean_installment[which.max(month_lag)] -
mean_installment[which.min(month_lag)],
v_n_merchant = var(n_merchant),
d_n_merchant = n_merchant[which.max(month_lag)] -
n_merchant[which.min(month_lag)],
v_n_city = var(n_city),
d_n_city = n_city[which.max(month_lag)] -
n_city[which.min(month_lag)]
), by = card_id
]
# sum_trend_hist[is.na(d_mean_installment), d_mean_installment := 0]
# sum_trend_hist[is.na(v_mean_installment), v_mean_installment := 0]
1.2.3 The new table
The structure of this table is exactly the same as that of the history table. The only difference is that the transaction dates are newer, which may lead to different weights/treatments in the model. We apply the same engineering process to this table as we did for history.
sum_new <- new[!(purchase_amount > 100 & authorized_flag == 'N'),
.(n_trans = .N, # number of transactions
n_denied = sum(authorized_flag == 'N', na.rm = T), # number of denied transactions
ratio_denied = sum(authorized_flag == 'N', na.rm = T) / .N, # the ratio of denied transactions
cat_1_N = sum(category_1 == 'N', na.rm = T), # category 1 level counts
cat_1_Y = sum(category_1 == 'Y', na.rm = T),
cat_2_1 = sum(category_2 == 1, na.rm = T), # category 2 level counts
cat_2_2 = sum(category_2 == 2, na.rm = T),
cat_2_3 = sum(category_2 == 3, na.rm = T),
cat_2_4 = sum(category_2 == 4, na.rm = T),
cat_2_5 = sum(category_2 == 5, na.rm = T),
cat_3_A = sum(category_3 == 'A', na.rm = T), # category 3 level counts
cat_3_B = sum(category_3 == 'B', na.rm = T),
cat_3_C = sum(category_3 == 'C', na.rm = T),
spend = sum(purchase_amount[authorized_flag == 'Y'], na.rm = T),
n_installment_outlier = sum(installments < 0 | installments > 99, na.rm = T),
mean_installment = mean(installments[installments >= 0 & installments < 99], na.rm = T),
n_merchant = uniqueN(merchant_category_id),
n_subsector = uniqueN(paste0(merchant_category_id, subsector_id)),
n_city = uniqueN(paste0(state_id, city_id))
),
by = card_id]
# sum_new[is.na(mean_installment), mean_installment := (-1)]
Similarly, get the per-month trend statistics for each card in the new transactions.
trend_new <- new[!(purchase_amount > 100 & authorized_flag == 'N'),
.(n_trans = .N, # number of transactions
ratio_denied = sum(authorized_flag == 'N', na.rm = T) / .N, # the ratio of denied transactions
cat_1_N = sum(category_1 == 'N', na.rm = T), # category 1 level counts
cat_2_1 = sum(category_2 == 1, na.rm = T), # category 2 level counts
cat_2_2 = sum(category_2 == 2, na.rm = T),
cat_2_3 = sum(category_2 == 3, na.rm = T),
cat_2_4 = sum(category_2 == 4, na.rm = T),
cat_3_A = sum(category_3 == 'A', na.rm = T), # category 3 level counts
cat_3_B = sum(category_3 == 'B', na.rm = T),
spend = sum(purchase_amount[authorized_flag == 'Y'], na.rm = T),
mean_installment = mean(installments[installments >= 0 & installments < 99], na.rm = T),
n_merchant = uniqueN(merchant_category_id),
n_city = uniqueN(paste0(state_id, city_id))
),
by = .(card_id, month_lag)]
# trend_new[is.na(mean_installment), mean_installment := (-1)]
Now, summarize the trend in the new transactions into the variance over time and the delta over time, as before.
rm(new); invisible(gc())
sum_trend_new <- trend_new[, .(
v_n_trans = var(n_trans),
d_n_trans = n_trans[which.max(month_lag)] -
n_trans[which.min(month_lag)],
v_ratio_denied = var(ratio_denied),
d_ratio_denied = ratio_denied[which.max(month_lag)] -
ratio_denied[which.min(month_lag)],
v_cat_1_N = var(cat_1_N),
d_cat_1_N = cat_1_N[which.max(month_lag)] -
cat_1_N[which.min(month_lag)],
v_cat_2_1 = var(cat_2_1),
d_cat_2_1 = cat_2_1[which.max(month_lag)] -
cat_2_1[which.min(month_lag)],
v_cat_2_2 = var(cat_2_2),
d_cat_2_2 = cat_2_2[which.max(month_lag)] -
cat_2_2[which.min(month_lag)],
v_cat_2_3 = var(cat_2_3),
d_cat_2_3 = cat_2_3[which.max(month_lag)] -
cat_2_3[which.min(month_lag)],
v_cat_2_4 = var(cat_2_4),
d_cat_2_4 = cat_2_4[which.max(month_lag)] -
cat_2_4[which.min(month_lag)],
v_cat_3_A = var(cat_3_A),
d_cat_3_A = cat_3_A[which.max(month_lag)] -
cat_3_A[which.min(month_lag)],
v_cat_3_B = var(cat_3_B),
d_cat_3_B = cat_3_B[which.max(month_lag)] -
cat_3_B[which.min(month_lag)],
v_spend = var(spend),
d_spend = spend[which.max(month_lag)] -
spend[which.min(month_lag)],
v_mean_installment = var(mean_installment),
d_mean_installment =
mean_installment[which.max(month_lag)] -
mean_installment[which.min(month_lag)],
v_n_merchant = var(n_merchant),
d_n_merchant = n_merchant[which.max(month_lag)] -
n_merchant[which.min(month_lag)],
v_n_city = var(n_city),
d_n_city = n_city[which.max(month_lag)] -
n_city[which.min(month_lag)]
), by = card_id
]
# sum_trend_new[is.na(v_n_trans), v_n_trans := 0]
# sum_trend_new[is.na(v_ratio_denied), v_ratio_denied := 0]
# sum_trend_new[is.na(v_cat_1_N), v_cat_1_N := 0]
# sum_trend_new[is.na(v_cat_2_1), v_cat_2_1 := 0]
# sum_trend_new[is.na(v_cat_2_2), v_cat_2_2 := 0]
# sum_trend_new[is.na(v_cat_2_3), v_cat_2_3 := 0]
# sum_trend_new[is.na(v_cat_2_4), v_cat_2_4 := 0]
# sum_trend_new[is.na(v_cat_3_A), v_cat_3_A := 0]
# sum_trend_new[is.na(v_cat_3_B), v_cat_3_B := 0]
# sum_trend_new[is.na(v_spend), v_spend := 0]
# sum_trend_new[is.na(v_mean_installment), v_mean_installment := 0]
# sum_trend_new[is.na(v_n_merchant), v_n_merchant := 0]
# sum_trend_new[is.na(v_n_city), v_n_city := 0]
1.3 Merge the features
We merge all the features obtained so far into a single table, then split it back into train_dt (with the target attached) and test_dt.
data_dt <- merge( merge( merge( merge(data, sum_hist, by = 'card_id', all.x = T),
sum_trend_hist, by = 'card_id' , all.x = T),
sum_new, by = 'card_id', all.x = T),
sum_trend_new, by = 'card_id', all.x = T)
train_dt <- data_dt[card_id %in% id_tr,]
train_dt <- merge(train_dt, target, by = 'card_id', all.x = T)
test_dt <- data_dt[card_id %in% id_te,]
save(test_dt, train_dt, target, file = 'elo_xgb_v3.Rdata')
1.4 Additional notes about the data
- About the purchase amount: there are some extremely high values of purchase_amount in the transactions. While the majority of the values are below 0, some values are up to 6 million! We looked at these records and they seem to be entry errors (e.g., wrong digits keyed in by the cashier/merchant), since many of them were not authorized. We exclude these transactions when they were not authorized; this is the !(purchase_amount > 100 & authorized_flag == 'N') filter applied in the summaries above.
- There are a lot of NAs in category_2 in the historical transactions data. These seem to be out-of-country transactions, since state_id and city_id are both -1 for them. We can count these transactions per card as a proxy for how much/often the customer traveled abroad (a sketch of this count is shown after this list).
- There are some -1s for merchant_category_id and subsector_id, many of which are due to transactions happening abroad (state_id = -1, probably recorded in a foreign language and hard to classify). There were about 40 with normal state and city ids; these are probably just peculiar transactions via platforms like Etsy.
2. Training the model: xgboost
We chose to use xgboost, via the mlr interface, as our model.
set.seed(33)
# keep only targets within [-10, 10] (drops extreme outlier targets)
train_dt <- train_dt[target >= (-10) & target <= 10]
# use mean instead of rmse for measure aggregation (see setAggregation() below)
# NaN values (from empty installment subsets) are converted to NA so they are treated as missing
train_dt[is.nan(d_mean_installment.x), d_mean_installment.x := NA]
train_dt[is.nan(d_mean_installment.y), d_mean_installment.y := NA]
train_dt[is.nan(mean_installment.y), mean_installment.y := NA]
train_dt$card_id <- NULL
#### Train Model ####
library(mlr)  # the 'regr.xgboost' learner used below also requires the xgboost package
start.time <- Sys.time()
train_task <- makeRegrTask(data = train_dt, target = 'target')
getParamSet('regr.xgboost')
## Type len Def Constr
## booster discrete - gbtree gbtree,gblinear,dart
## watchlist untyped - <NULL> -
## eta numeric - 0.3 0 to 1
## gamma numeric - 0 0 to Inf
## max_depth integer - 6 1 to Inf
## min_child_weight numeric - 1 0 to Inf
## subsample numeric - 1 0 to 1
## colsample_bytree numeric - 1 0 to 1
## colsample_bylevel numeric - 1 0 to 1
## num_parallel_tree integer - 1 1 to Inf
## lambda numeric - 1 0 to Inf
## lambda_bias numeric - 0 0 to Inf
## alpha numeric - 0 0 to Inf
## objective untyped - reg:linear -
## eval_metric untyped - rmse -
## base_score numeric - 0.5 -Inf to Inf
## max_delta_step numeric - 0 0 to Inf
## missing numeric - -Inf to Inf
## monotone_constraints integervector <NA> 0 -1 to 1
## tweedie_variance_power numeric - 1.5 1 to 2
## nthread integer - - 1 to Inf
## nrounds integer - 1 1 to Inf
## feval untyped - <NULL> -
## verbose integer - 1 0 to 2
## print_every_n integer - 1 1 to Inf
## early_stopping_rounds integer - <NULL> 1 to Inf
## maximize logical - <NULL> -
## sample_type discrete - uniform uniform,weighted
## normalize_type discrete - tree tree,forest
## rate_drop numeric - 0 0 to 1
## skip_drop numeric - 0 0 to 1
## callbacks untyped - <list> -
## Req Tunable Trafo
## booster - TRUE -
## watchlist - FALSE -
## eta - TRUE -
## gamma - TRUE -
## max_depth - TRUE -
## min_child_weight - TRUE -
## subsample - TRUE -
## colsample_bytree - TRUE -
## colsample_bylevel - TRUE -
## num_parallel_tree - TRUE -
## lambda - TRUE -
## lambda_bias - TRUE -
## alpha - TRUE -
## objective - FALSE -
## eval_metric - FALSE -
## base_score - FALSE -
## max_delta_step - TRUE -
## missing - FALSE -
## monotone_constraints - TRUE -
## tweedie_variance_power Y TRUE -
## nthread - FALSE -
## nrounds - TRUE -
## feval - FALSE -
## verbose - FALSE -
## print_every_n Y FALSE -
## early_stopping_rounds - FALSE -
## maximize - FALSE -
## sample_type Y TRUE -
## normalize_type Y TRUE -
## rate_drop Y TRUE -
## skip_drop Y TRUE -
## callbacks - FALSE -
# make learner with initial parameters
learner <- makeLearner("regr.xgboost")
#define parameters for tuning
params <- makeParamSet(
makeIntegerParam("max_depth",lower=1,upper=10),
makeNumericParam("lambda",lower=0,upper=10),
makeNumericParam("eta", lower = 0.001, upper = 0.3),
makeNumericParam("subsample", lower = 0.10, upper = 0.80),
makeNumericParam("min_child_weight",lower=0.9,upper=5),
makeNumericParam("colsample_bytree",lower = 0.2,upper = 0.8))
# define the random-search strategy
rancontrol <- makeTuneControlRandom(maxit = 30L) # 30 random-search iterations
# 5-fold cross-validation
set_cv <- makeResampleDesc("CV", iters = 5L)
# tune parameters
setAggregation(rmse, test.mean) # returns a modified copy of the measure; see the note below
## Name: Root mean squared error
## Performance measure: rmse
## Properties: regr,req.pred,req.truth
## Minimize: TRUE
## Best: 0; Worst: Inf
## Aggregated by: test.mean
## Arguments:
## Note: The RMSE is aggregated as sqrt(mean(rmse.vals.on.test.sets^2)). If you don't want that, you could also use `test.mean`.
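Note that in mlr, setAggregation() returns a modified copy of the measure; since the result above is not assigned, tuneParams() below still uses rmse with its default test.rmse aggregation (hence the rmse.test.rmse entries in the log). A sketch of how the mean aggregation could actually be applied (not what was run here):
rmse_mean <- setAggregation(rmse, test.mean)  # copy of rmse aggregated by a plain mean over folds
# tune <- tuneParams(learner = learner, task = train_task, resampling = set_cv,
#                    measures = list(rmse_mean), par.set = params, control = rancontrol)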
tune <- tuneParams(learner = learner,
task = train_task,
resampling = set_cv,
measures = rmse,
par.set = params,
control = rancontrol)
## [Tune] Started tuning learner regr.xgboost for parameter set:
## Type len Def Constr Req Tunable Trafo
## max_depth integer - - 1 to 10 - TRUE -
## lambda numeric - - 0 to 10 - TRUE -
## eta numeric - - 0.001 to 0.3 - TRUE -
## subsample numeric - - 0.1 to 0.8 - TRUE -
## min_child_weight numeric - - 0.9 to 5 - TRUE -
## colsample_bytree numeric - - 0.2 to 0.8 - TRUE -
## With control class: TuneControlRandom
## Imputation value: Inf
## [Tune-x] 1: max_depth=8; lambda=4.81; eta=0.0345; subsample=0.255; min_child_weight=4.97; colsample_bytree=0.589
## [Tune-y] 1: rmse.test.rmse=1.7645043; time: 0.1 min
## [Tune-x] 2: max_depth=4; lambda=9.6; eta=0.227; subsample=0.272; min_child_weight=4.14; colsample_bytree=0.543
## [Tune-y] 2: rmse.test.rmse=1.7402907; time: 0.1 min
## [Tune-x] 3: max_depth=10; lambda=7.02; eta=0.257; subsample=0.744; min_child_weight=2.37; colsample_bytree=0.479
## [Tune-y] 3: rmse.test.rmse=1.7375269; time: 0.1 min
## [Tune-x] 4: max_depth=10; lambda=0.476; eta=0.281; subsample=0.438; min_child_weight=4.63; colsample_bytree=0.465
## [Tune-y] 4: rmse.test.rmse=1.7352470; time: 0.1 min
## [Tune-x] 5: max_depth=10; lambda=1.78; eta=0.147; subsample=0.52; min_child_weight=1.68; colsample_bytree=0.343
## [Tune-y] 5: rmse.test.rmse=1.7521340; time: 0.1 min
## [Tune-x] 6: max_depth=10; lambda=2.45; eta=0.0457; subsample=0.213; min_child_weight=2.34; colsample_bytree=0.235
## [Tune-y] 6: rmse.test.rmse=1.7676664; time: 0.1 min
## [Tune-x] 7: max_depth=5; lambda=4.94; eta=0.221; subsample=0.177; min_child_weight=2.09; colsample_bytree=0.796
## [Tune-y] 7: rmse.test.rmse=1.7084184; time: 0.1 min
## [Tune-x] 8: max_depth=6; lambda=5.1; eta=0.104; subsample=0.285; min_child_weight=4.5; colsample_bytree=0.265
## [Tune-y] 8: rmse.test.rmse=1.7587794; time: 0.1 min
## [Tune-x] 9: max_depth=2; lambda=1.3; eta=0.2; subsample=0.638; min_child_weight=4.18; colsample_bytree=0.464
## [Tune-y] 9: rmse.test.rmse=1.7444550; time: 0.1 min
## [Tune-x] 10: max_depth=6; lambda=2.69; eta=0.0741; subsample=0.187; min_child_weight=3.98; colsample_bytree=0.529
## [Tune-y] 10: rmse.test.rmse=1.7626382; time: 0.1 min
## [Tune-x] 11: max_depth=5; lambda=2.54; eta=0.187; subsample=0.658; min_child_weight=1.14; colsample_bytree=0.572
## [Tune-y] 11: rmse.test.rmse=1.7236885; time: 0.1 min
## [Tune-x] 12: max_depth=5; lambda=0.0369; eta=0.277; subsample=0.259; min_child_weight=2.14; colsample_bytree=0.696
## [Tune-y] 12: rmse.test.rmse=1.6928579; time: 0.1 min
## [Tune-x] 13: max_depth=6; lambda=9.18; eta=0.0253; subsample=0.593; min_child_weight=3.25; colsample_bytree=0.748
## [Tune-y] 13: rmse.test.rmse=1.7663249; time: 0.1 min
## [Tune-x] 14: max_depth=1; lambda=0.726; eta=0.177; subsample=0.103; min_child_weight=4.11; colsample_bytree=0.701
## [Tune-y] 14: rmse.test.rmse=1.7328596; time: 0.1 min
## [Tune-x] 15: max_depth=3; lambda=4.66; eta=0.0759; subsample=0.77; min_child_weight=3.97; colsample_bytree=0.678
## [Tune-y] 15: rmse.test.rmse=1.7536960; time: 0.1 min
## [Tune-x] 16: max_depth=8; lambda=0.389; eta=0.158; subsample=0.248; min_child_weight=2.62; colsample_bytree=0.342
## [Tune-y] 16: rmse.test.rmse=1.7505078; time: 0.1 min
## [Tune-x] 17: max_depth=10; lambda=5.68; eta=0.138; subsample=0.65; min_child_weight=4.69; colsample_bytree=0.777
## [Tune-y] 17: rmse.test.rmse=1.7278849; time: 0.1 min
## [Tune-x] 18: max_depth=8; lambda=4.23; eta=0.114; subsample=0.544; min_child_weight=1.92; colsample_bytree=0.722
## [Tune-y] 18: rmse.test.rmse=1.7361475; time: 0.1 min
## [Tune-x] 19: max_depth=8; lambda=5.52; eta=0.192; subsample=0.659; min_child_weight=2.85; colsample_bytree=0.65
## [Tune-y] 19: rmse.test.rmse=1.7211587; time: 0.1 min
## [Tune-x] 20: max_depth=7; lambda=3.24; eta=0.125; subsample=0.456; min_child_weight=1.86; colsample_bytree=0.212
## [Tune-y] 20: rmse.test.rmse=1.7558817; time: 0.1 min
## [Tune-x] 21: max_depth=4; lambda=1.87; eta=0.112; subsample=0.536; min_child_weight=2.41; colsample_bytree=0.692
## [Tune-y] 21: rmse.test.rmse=1.7402109; time: 0.1 min
## [Tune-x] 22: max_depth=6; lambda=7; eta=0.162; subsample=0.609; min_child_weight=1.09; colsample_bytree=0.401
## [Tune-y] 22: rmse.test.rmse=1.7494700; time: 0.1 min
## [Tune-x] 23: max_depth=2; lambda=2.52; eta=0.091; subsample=0.786; min_child_weight=2.7; colsample_bytree=0.599
## [Tune-y] 23: rmse.test.rmse=1.7508730; time: 0.1 min
## [Tune-x] 24: max_depth=9; lambda=9.59; eta=0.25; subsample=0.679; min_child_weight=4.35; colsample_bytree=0.595
## [Tune-y] 24: rmse.test.rmse=1.7072511; time: 0.1 min
## [Tune-x] 25: max_depth=5; lambda=6.98; eta=0.245; subsample=0.221; min_child_weight=2.99; colsample_bytree=0.269
## [Tune-y] 25: rmse.test.rmse=1.7398043; time: 0.1 min
## [Tune-x] 26: max_depth=2; lambda=4.66; eta=0.032; subsample=0.592; min_child_weight=1.99; colsample_bytree=0.345
## [Tune-y] 26: rmse.test.rmse=1.7696255; time: 0.1 min
## [Tune-x] 27: max_depth=1; lambda=8.31; eta=0.245; subsample=0.52; min_child_weight=2.98; colsample_bytree=0.563
## [Tune-y] 27: rmse.test.rmse=1.7193074; time: 0.1 min
## [Tune-x] 28: max_depth=7; lambda=8.81; eta=0.177; subsample=0.559; min_child_weight=4.28; colsample_bytree=0.704
## [Tune-y] 28: rmse.test.rmse=1.7171898; time: 0.1 min
## [Tune-x] 29: max_depth=6; lambda=0.525; eta=0.0758; subsample=0.229; min_child_weight=3.81; colsample_bytree=0.596
## [Tune-y] 29: rmse.test.rmse=1.7528551; time: 0.1 min
## [Tune-x] 30: max_depth=9; lambda=6.48; eta=0.201; subsample=0.678; min_child_weight=4.82; colsample_bytree=0.247
## [Tune-y] 30: rmse.test.rmse=1.7454749; time: 0.1 min
## [Tune] Result: max_depth=5; lambda=0.0369; eta=0.277; subsample=0.259; min_child_weight=2.14; colsample_bytree=0.696 : rmse.test.rmse=1.6928579
# set the tuned hyperparameters on the learner
new_learner <- setHyperPars(learner = learner, par.vals = tune$x)
# train the final model
new_model <- mlr::train(new_learner, train_task)
# get training time and print
end.time <- Sys.time()
print('Training Time:')
## [1] "Training Time:"
print(time.taken <- round(end.time-start.time,2))
## Time difference of 3.55 mins
3. Prediction
id_te <- test_dt$card_id
test_dt$card_id <- NULL
pred <- predict(new_model, newdata = test_dt)
pred_dt <- data.table(card_id = id_te,
target = pred$data$response)
pred_dt <- pred_dt[match(sample_submission$card_id, id_te)]  # reorder rows to match the sample submission
if (! dir.exists('output') ) dir.create('output')
fwrite(x = pred_dt, file = 'output/sub_7.csv')