chwdy · chwdy · Nov 26, 2020 · Nov 26, 2020 · Nov 26, 2020
diff --git a/Group5 Presentation.pptx b/Group5 Presentation.pptx
diff --git a/Group5 Presentation.pptx.pdf b/Group5 Presentation.pptx.pdf
diff --git a/MainCode.Rmd b/MainCode.Rmd
@@ -840,6 +840,7 @@ for ( i in colnames(temp1)){
   }
 }
 temp1 <- ROSE(TARGET~., data=temp1,N = 540000, seed=1)$data
+temp1[is.na(temp1)] <- "NA"
 
 ##model
 
@@ -862,7 +863,7 @@ lgb.valid = lgb.Dataset(data.matrix(va), label = va_ta)
 lgb.params<- list(objective = "binary",
                   metric = "auc",
                   num_leaves = 32,
-                  max_depth=8,
+                  max_depth=12,
                   num_threads = detectCores(),
                   min_data_in_leaf = 10,
                   min_sum_hessian_in_leaf = 40,
@@ -873,14 +874,14 @@ lgb.params<- list(objective = "binary",
                   lambda_l2 = 0.073,
                   min_gain_to_split=0.02
                   )
-
 lgb.model <- lgb.train(params = lgb.params,
                        data = lgb.train,
                        valids = list(val = lgb.valid),
                        learning_rate = 0.02,
-                       nrounds = 3000,
+                       nrounds = 5000,
                        early_stopping_rounds = 200,
-                       eval_freq = 50
+                       eval_freq = 50,
+                       boosting="dart"
                        )
 
 # Make prediction
@@ -914,7 +915,8 @@ gbm.plot<-plot(roc(Target,gbm.probs))
 
 ##save model
 
-save(lgb.model,file = "lgbmModel.Rdata")
+lgb.save(lgb.model, file = "lgbmModel.Rdata")
+
 ```
 
 ## 2.4 Support Vector Machine

diff --git a/PredictionCode.Rmd b/PredictionCode.Rmd
@@ -588,7 +588,8 @@ names(data.dummified)<-make.names(names(data.dummified),unique = TRUE)
 ```{r}
 
 ## load lgbm model
-lgb.model<-lgb.load("lgbmodel.model")
+library(lightgbm)
+lgb.model<-lgb.load(filename="lgbmModel.Rdata")
 
 temp1<-data
 
@@ -607,5 +608,5 @@ lgb_pred<-ifelse(lgb_pred>0.5,1,0)
 #write back
 read_csv("submission.csv") %>%  
   mutate(SK_ID_CURR = as.integer(SK_ID_CURR),TARGET = as.integer(lgb_pred))%>%select(1:2)%>%
-  write_csv("submission_Group5.csv")
+  write_csv("submission_Group5_aftersubmit.csv")
 ```
diff --git a/README.md b/README.md
@@ -0,0 +1,42 @@
+#  Default Risk Prediction
+
+
+
+##  Overview 
+
+This is the second project for the 8413 Business Analyst class.
+
+The project works on a subset of the [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk/overview) Kaggle competition data, using credit card history and previous applications as external datasets.
+
+Instructor: JC Bonilla
+
+Team: Diwei Zhu, Gabriela Caballero, Kunyang Que, Ullas Srivastava, Yangxing Liu
+
+##  Data Cleaning
+
+
+
+* application data
+
+  Some columns in the dataset, such as *Flag own a car* and *car age*, are interconnected. We performed data validation checks to see whether there were any logical errors. For columns like these, we replace the inconsistent values (TODO: describe the replacement rule).
+
+  We removed categorical values whose frequency is below 1%.
+
+* credit card & previous application
+
+  A single *SK_ID_CURR* in the application set can have more than one record in these external sets, so we engineered aggregate statistics such as approval rate and record count.
+
+
+
+##  Model Training & selection
+
+* Logistic model
+
+* gbm
+* lgbm
+
+
+
+##  Outcome
+
+The LightGBM model reached 73% accuracy on the given submission set.