diff --git a/Group5 Presentation.pptx b/Group5 Presentation.pptx new file mode 100644 index 0000000..aa2784d Binary files /dev/null and b/Group5 Presentation.pptx differ diff --git a/Group5 Presentation.pptx.pdf b/Group5 Presentation.pptx.pdf new file mode 100644 index 0000000..55fd24c Binary files /dev/null and b/Group5 Presentation.pptx.pdf differ diff --git a/MainCode.Rmd b/MainCode.Rmd index 128c212..55be9e1 100644 --- a/MainCode.Rmd +++ b/MainCode.Rmd @@ -840,6 +840,7 @@ for ( i in colnames(temp1)){ } } temp1 <- ROSE(TARGET~., data=temp1,N = 540000, seed=1)$data +temp1[is.na(temp1)] <- "NA" ##model @@ -862,7 +863,7 @@ lgb.valid = lgb.Dataset(data.matrix(va), label = va_ta) lgb.params<- list(objective = "binary", metric = "auc", num_leaves = 32, - max_depth=8, + max_depth=12, num_threads = detectCores(), min_data_in_leaf = 10, min_sum_hessian_in_leaf = 40, @@ -873,14 +874,14 @@ lgb.params<- list(objective = "binary", lambda_l2 = 0.073, min_gain_to_split=0.02 ) - lgb.model <- lgb.train(params = lgb.params, data = lgb.train, valids = list(val = lgb.valid), learning_rate = 0.02, - nrounds = 3000, + nrounds = 5000, early_stopping_rounds = 200, - eval_freq = 50 + eval_freq = 50, + boosting="dart" ) # Make prediction @@ -914,7 +915,8 @@ gbm.plot<-plot(roc(Target,gbm.probs)) ##save model -save(lgb.model,file = "lgbmModel.Rdata") +lgb.save(lgb.model, file = "lgbmModel.Rdata") + ``` ## 2.4 Support Vector Machine diff --git a/PredictionCode.Rmd b/PredictionCode.Rmd index 8fa6c82..608603d 100644 --- a/PredictionCode.Rmd +++ b/PredictionCode.Rmd @@ -588,7 +588,8 @@ names(data.dummified)<-make.names(names(data.dummified),unique = TRUE) ```{r} ## load lgbm model -lgb.model<-lgb.load("lgbmodel.model") +library(lightgbm) +lgb.model<-lgb.load(filename="lgbmModel.Rdata") temp1<-data @@ -607,5 +608,5 @@ lgb_pred<-ifelse(lgb_pred>0.5,1,0) #write back read_csv("submission.csv") %>% mutate(SK_ID_CURR = as.integer(SK_ID_CURR),TARGET = as.integer(lgb_pred))%>%select(1:2)%>% - 
write_csv("submission_Group5.csv") + write_csv("submission_Group5_aftersubmit.csv") ``` diff --git a/README.md b/README.md new file mode 100644 index 0000000..e7517f3 --- /dev/null +++ b/README.md @@ -0,0 +1,42 @@ +# Default Risk Prediction + + + +## Overview + +This is the 2nd project of the 8413 Business analyst class. + +The project is a subset of [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk/overview), using credit card history and previous applications as external datasets. + +Instructor: JC Bonilla + +Team: Diwei Zhu, Gabriela Caballero, Kunyang Que, Ullas Srivastava, Yangxing Liu + +## Data Cleaning + + + +* application data + + Some columns in the dataset, such as *Flag own a car* and *car age*, are interconnected. We performed data validation checks to see if there are logical errors. For columns like these, we replace + + We removed categorical data which are <1%. + +* credit card & previous application + + For a single *SK_ID_CURR* in the application set, there is more than one record in these external sets. We engineered summary statistics such as approval rate and record count. + + + +## Model Training & selection + +* Logistic model + +* gbm +* lgbm + + + +## Outcome + +The lgbm model reached 73% accuracy on the given submission set. \ No newline at end of file