Linear Regression to Predict Interest Rate
Intro
This is a short intro to linear regression with examples inspired by my Data Science for Business class at LBS.
Libraries used within the code
library(tidyverse) # the usual stuff: dplyr, readr, and other goodies
library(lubridate) # to handle dates
library(GGally) # for correlation-scatterplot matrix
library(car) # vif() function to check for multicolinearity
library(ggfortify) # to produce residual diagnostic plots
library(rsample) # to split dataframe in training- & testing sets
library(here) # to read files and organise data
library(janitor) # clean_names()
library(broom) # use broom:augment() to get tidy table with regression output, residuals, etc
library(huxtable) # to get summary table of all models produced
library(caret) # to train more advanced models (k-fold cross-validation, stepwise regression, LASSO)
library(zoo) #to allow for time series operations
library(here)
library (usmap)
Load and prepare the data
We start by loading the data to R in a dataframe.
# Read the LendingClub extract. The first row of the file is a title line, so
# it is skipped; janitor::clean_names() then standardises the column names.
lc_raw <- clean_names(
  read_csv(here::here("csv", "LendingClub Data.csv"), skip = 1)
)
ICE the data: Inspect, Clean, Explore
Any data science engagement starts with ICE. Inspect, Clean and Explore the data.
# First inspection: one line per column with its type and leading values.
glimpse(lc_raw)
## Rows: 42,538
## Columns: 80
## $ int_rate <dbl> 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.…
## $ loan_amnt <dbl> 8000, 6000, 6500, 8000, 5500, 6000, 10200, 15000, …
## $ term_months <dbl> 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36…
## $ installment <dbl> 241.28, 180.96, 196.04, 241.28, 165.88, 180.96, 30…
## $ dti <dbl> 2.11, 5.73, 17.68, 22.71, 5.75, 21.92, 18.62, 9.72…
## $ delinq_2yrs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ annual_inc <dbl> 50000, 52800, 35352, 79200, 240000, 89000, 110000,…
## $ grade <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", …
## $ emp_title <chr> NA, "coral graphics", NA, "Honeywell", "O T Plus, …
## $ emp_length <chr> "5 years", "< 1 year", "n/a", "n/a", "10+ years", …
## $ home_ownership <chr> "MORTGAGE", "MORTGAGE", "MORTGAGE", "MORTGAGE", "M…
## $ verification_status <chr> "Verified", "Source Verified", "Not Verified", "Ve…
## $ issue_d <chr> "9/1/2011", "9/1/2011", "9/1/2011", "9/1/2011", "9…
## $ zip_code <chr> "977xx", "228xx", "864xx", "322xx", "278xx", "760x…
## $ addr_state <chr> "OR", "VA", "AZ", "FL", "NC", "TX", "CT", "WA", "N…
## $ loan_status <chr> "Fully Paid", "Charged Off", "Fully Paid", "Fully …
## $ desc <chr> "Borrower added on 09/08/11 > Consolidating debt f…
## $ purpose <chr> "debt_consolidation", "vacation", "credit_card", "…
## $ title <chr> "Credit Card Payoff $8K", "bad choice", "Credit Ca…
## $ x20 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x21 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x22 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x23 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x24 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x25 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x26 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x27 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x28 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x29 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x30 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x31 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x32 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x33 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x34 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x35 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x36 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x37 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x38 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x39 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x40 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x41 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x42 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x43 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x44 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x45 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x46 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x47 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x48 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x49 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x50 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x51 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x52 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x53 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x54 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x55 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x56 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x57 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x58 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x59 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x60 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x61 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x62 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x63 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x64 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x65 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x66 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x67 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x68 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x69 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x70 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x71 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x72 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x73 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x74 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x75 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x76 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x77 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x78 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x79 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ x80 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# Build the analysis table from the raw import:
#   * drop the columns x20 through x80,
#   * drop rows that have no interest rate,
#   * parse the issue date and recode term / delinquencies as factors,
#   * push a few less important columns to the end of the table.
lc_clean <- lc_raw %>%
  dplyr::select(-(x20:x80)) %>%
  filter(!is.na(int_rate)) %>%
  mutate(
    issue_d = mdy(issue_d),            # lubridate::mdy() parses month/day/year text
    term = factor(term_months),        # categorical version of term_months
    delinq_2yrs = factor(delinq_2yrs)  # treat the delinquency count as categorical
  ) %>%
  relocate(term_months, installment, emp_title, .after = last_col())

glimpse(lc_clean)
## Rows: 37,869
## Columns: 20
## $ int_rate <dbl> 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.…
## $ loan_amnt <dbl> 8000, 6000, 6500, 8000, 5500, 6000, 10200, 15000, …
## $ dti <dbl> 2.11, 5.73, 17.68, 22.71, 5.75, 21.92, 18.62, 9.72…
## $ delinq_2yrs <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ annual_inc <dbl> 50000, 52800, 35352, 79200, 240000, 89000, 110000,…
## $ grade <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", …
## $ emp_length <chr> "5 years", "< 1 year", "n/a", "n/a", "10+ years", …
## $ home_ownership <chr> "MORTGAGE", "MORTGAGE", "MORTGAGE", "MORTGAGE", "M…
## $ verification_status <chr> "Verified", "Source Verified", "Not Verified", "Ve…
## $ issue_d <date> 2011-09-01, 2011-09-01, 2011-09-01, 2011-09-01, 2…
## $ zip_code <chr> "977xx", "228xx", "864xx", "322xx", "278xx", "760x…
## $ addr_state <chr> "OR", "VA", "AZ", "FL", "NC", "TX", "CT", "WA", "N…
## $ loan_status <chr> "Fully Paid", "Charged Off", "Fully Paid", "Fully …
## $ desc <chr> "Borrower added on 09/08/11 > Consolidating debt f…
## $ purpose <chr> "debt_consolidation", "vacation", "credit_card", "…
## $ title <chr> "Credit Card Payoff $8K", "bad choice", "Credit Ca…
## $ term <fct> 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36…
## $ term_months <dbl> 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36…
## $ installment <dbl> 241.28, 180.96, 196.04, 241.28, 165.88, 180.96, 30…
## $ emp_title <chr> NA, "coral graphics", NA, "Honeywell", "O T Plus, …
The data is now in a clean format stored in the dataframe “lc_clean.”
Explore the Data through Visualisation
# Distribution of interest rates across all loans
ggplot(lc_clean, aes(x = int_rate)) +
  geom_histogram() +
  labs(
    title = "Histogram of Interest Rates",
    x = "interest rate",
    y = "count"
  ) +
  theme_bw()
# Same histogram, with the bars filled by loan grade
ggplot(lc_clean, aes(x = int_rate, fill = grade)) +
  geom_histogram() +
  labs(
    title = "Histogram of Interest Rates by Loan Grade",
    x = "interest rate",
    y = "count"
  ) +
  theme_bw()
# Interest rate against loan amount, with the least-squares line overlaid
ggplot(lc_clean, aes(x = loan_amnt, y = int_rate)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Loan amount vs Interest rate",
    x = "loan amount",
    y = "interest rate"
  ) +
  theme_bw()
# Interest rate against annual income, with the least-squares line overlaid
# (no confidence band).
lc_clean %>%
  ggplot(aes(y = int_rate, x = annual_inc)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) + # FALSE spelled out: F can be reassigned
  theme_bw() +
  labs(
    title = "Annual Income vs Interest rate",
    y = "interest rate",
    x = "Annual Income"
  )
# Optional: append `+ scale_x_log10()` to the plot above to put income on a log scale.
# scale_x_log10()
# On one set of axes, draw a box plot of the interest rate for every value of
# delinq_2yrs (a factor in lc_clean, so each level gets its own box).
lc_clean %>%
  ggplot(aes(y = int_rate, x = delinq_2yrs)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "Interest rates by Delinquency", # spelling of the labels corrected
    y = "interest rate",
    x = "delinquency"
  )
# Interest rate of every loan by issue date, with a smoothed trend
ggplot(lc_clean, aes(x = issue_d, y = int_rate)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Interest Rates over time",
    x = "date",
    y = "interest rate"
  ) +
  theme_bw()
# The same time series, one panel per loan grade
ggplot(lc_clean, aes(x = issue_d, y = int_rate)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(vars(grade)) +
  labs(
    title = "Interest Rates over time by Grade",
    x = "date",
    y = "interest rate"
  ) +
  theme_bw()
# Median annual income per state. The state abbreviation is copied into a
# column called `state`, which is the key used for the map data below.
map_us <- lc_clean %>%
  mutate(state = addr_state) %>%
  group_by(state) %>%
  # na.rm = TRUE so a single missing income does not blank out a whole state
  summarise(med_annual_inc = median(annual_inc, na.rm = TRUE))

# US map with median annual income distribution by state
plot_usmap(regions = "state", data = map_us, values = "med_annual_inc") +
  labs(
    title = "Median Annual income by State",
    subtitle = ""
  ) +
  scale_fill_continuous(
    low = "white",
    high = "darkgreen",
    name = "Median Annual income",
    labels = scales::comma # full argument name instead of the partial match `label`
  ) +
  theme(panel.background = element_blank(), legend.position = "right")
# Interest-rate histogram with the bars filled by home-ownership status
ggplot(lc_clean, aes(x = int_rate, fill = home_ownership)) +
  geom_histogram() +
  labs(
    title = "Histogram of Interest Rates by Home Ownership",
    x = "interest rate",
    y = "count"
  ) +
  theme_bw()
Estimate simple linear regression models
We start with a simple but quite powerful model.
# Baseline model: regress the interest rate on loan_amnt, term, dti,
# annual_inc and grade using lm().
model1 <- lm(
  int_rate ~ loan_amnt + term + dti + annual_inc + grade,
  data = lc_clean
)
summary(model1)
##
## Call:
## lm(formula = int_rate ~ loan_amnt + term + dti + annual_inc +
## grade, data = lc_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.118827 -0.007035 -0.000342 0.006828 0.035081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.169e-02 1.689e-04 424.363 < 2e-16 ***
## loan_amnt 1.475e-07 8.284e-09 17.809 < 2e-16 ***
## term60 3.608e-03 1.419e-04 25.431 < 2e-16 ***
## dti 4.328e-05 8.269e-06 5.234 1.66e-07 ***
## annual_inc -9.734e-10 9.283e-10 -1.049 0.294
## gradeB 3.554e-02 1.492e-04 238.248 < 2e-16 ***
## gradeC 6.016e-02 1.658e-04 362.783 < 2e-16 ***
## gradeD 8.172e-02 1.906e-04 428.746 < 2e-16 ***
## gradeE 9.999e-02 2.483e-04 402.660 < 2e-16 ***
## gradeF 1.195e-01 3.673e-04 325.408 < 2e-16 ***
## gradeG 1.355e-01 6.208e-04 218.245 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01056 on 37858 degrees of freedom
## Multiple R-squared: 0.9198, Adjusted R-squared: 0.9197
## F-statistic: 4.34e+04 on 10 and 37858 DF, p-value: < 2.2e-16
Let us consider the following Questions:
Are all variables statistically significant?
Interpret all the coefficients in the regression.
How much explanatory power does the model have?
How wide would the 95% confidence interval of any prediction based on this model be?
Every variable except annual_inc is statistically significant at the 5% level: all the other coefficients have p-values below 0.05, whereas annual_inc has a p-value of 0.294.
Coefficient Interpretation:
- loan_amnt: When your loan amount increases by 1 then your interest rate will increase by \(1.475*10^{-7}\), holding the other variables constant. Or if the loan amount increases by \(10,000\) the interest rate increases by about \(0.0015\), i.e. roughly 0.15 percentage points.
- Term60: The default for term is 36 so if you have a loan term of 60 then your interest rate will increase by \(3.608*10^{-3}\).
- Dti: When your dti increases by 1 then your interest rate will increase by \(4.328*10^{-5}\). Or if the dti increases by \(10\) the interest rate increases by \(4.328*10^{-4}\), i.e. about 0.04 percentage points.
- annual_inc: When your annual income increases by 1 then your interest rate will decrease by \(9.734*10^{-10}\). Or if the annual income increases by \(100,000\) the interest rate decreases by \(9.734*10^{-5}\), i.e. about 0.01 percentage points. Note that this coefficient is not statistically significant.
- GradeB: The default for Grade is A so if you have a grade of B then your interest rate will increase by \(3.554*10^{-2}\).
- GradeC: The default for Grade is A so if you have a grade of C then your interest rate will increase by \(6.016*10^{-2}\).
- GradeD: The default for Grade is A so if you have a grade of D then your interest rate will increase by \(8.172*10^{-2}\).
- GradeE: The default for Grade is A so if you have a grade of E then your interest rate will increase by \(9.999*10^{-2}\).
- GradeF: The default for Grade is A so if you have a grade of F then your interest rate will increase by \(1.195*10^{-1}\).
- GradeG: The default for Grade is A so if you have a grade of G then your interest rate will increase by \(1.355*10^{-1}\).
- This model has an \(Adjusted R^2=0.9197\) which means that it is explaining about 92% of the variability in the data. This is a very strong model.
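- An approximate 95% prediction interval is the predicted value \(\pm 1.96*0.01056\), i.e. 1.96 times the residual standard error. This gives \(\left[-0.0206976;+0.0206976 \right]\Rightarrow \left[ -2.06976 \%;+2.06976 \% \right]\) around the prediction, so the interval is about \(2*0.0207=0.0414\) wide, or roughly 4.1 percentage points.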
Feature Engineering
Let’s build progressively more complex models, with more features exploring how the model improves.
# Extend model 1 with an interaction between loan amount and grade. In a model
# formula, var1 * var2 adds both main effects and their interaction.
model2 <- lm(
  int_rate ~ loan_amnt * grade + term + dti + annual_inc,
  data = lc_clean
)
summary(model2)
##
## Call:
## lm(formula = int_rate ~ loan_amnt * grade + term + dti + annual_inc,
## data = lc_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.119807 -0.007230 -0.000057 0.006588 0.037460
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.171e-02 2.345e-04 305.762 < 2e-16 ***
## loan_amnt 1.528e-07 2.028e-08 7.537 4.91e-14 ***
## gradeB 3.623e-02 2.732e-04 132.619 < 2e-16 ***
## gradeC 6.198e-02 2.988e-04 207.452 < 2e-16 ***
## gradeD 8.082e-02 3.483e-04 232.034 < 2e-16 ***
## gradeE 9.633e-02 4.625e-04 208.293 < 2e-16 ***
## gradeF 1.143e-01 7.735e-04 147.699 < 2e-16 ***
## gradeG 1.327e-01 1.551e-03 85.580 < 2e-16 ***
## term60 3.793e-03 1.424e-04 26.636 < 2e-16 ***
## dti 3.836e-05 8.250e-06 4.649 3.34e-06 ***
## annual_inc -1.224e-09 9.249e-10 -1.324 0.18564
## loan_amnt:gradeB -6.617e-08 2.441e-08 -2.710 0.00673 **
## loan_amnt:gradeC -1.704e-07 2.621e-08 -6.500 8.14e-11 ***
## loan_amnt:gradeD 6.703e-08 2.798e-08 2.395 0.01662 *
## loan_amnt:gradeE 2.209e-07 3.016e-08 7.323 2.47e-13 ***
## loan_amnt:gradeF 2.779e-07 4.136e-08 6.720 1.85e-11 ***
## loan_amnt:gradeG 1.265e-07 7.260e-08 1.743 0.08140 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01052 on 37852 degrees of freedom
## Multiple R-squared: 0.9204, Adjusted R-squared: 0.9204
## F-statistic: 2.735e+04 on 16 and 37852 DF, p-value: < 2.2e-16
# Extend model 2 with the square and the cube of annual income by swapping
# annual_inc for poly(annual_inc, 3).
model3 <- lm(
  int_rate ~ loan_amnt * grade + term + dti + poly(annual_inc, 3),
  data = lc_clean
)
summary(model3)
##
## Call:
## lm(formula = int_rate ~ loan_amnt * grade + term + dti + poly(annual_inc,
## 3), data = lc_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.119814 -0.007238 -0.000065 0.006594 0.037471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.161e-02 2.279e-04 314.200 < 2e-16 ***
## loan_amnt 1.556e-07 2.047e-08 7.600 3.03e-14 ***
## gradeB 3.623e-02 2.732e-04 132.611 < 2e-16 ***
## gradeC 6.198e-02 2.988e-04 207.398 < 2e-16 ***
## gradeD 8.081e-02 3.483e-04 231.999 < 2e-16 ***
## gradeE 9.633e-02 4.625e-04 208.283 < 2e-16 ***
## gradeF 1.143e-01 7.736e-04 147.695 < 2e-16 ***
## gradeG 1.327e-01 1.551e-03 85.573 < 2e-16 ***
## term60 3.786e-03 1.426e-04 26.546 < 2e-16 ***
## dti 3.761e-05 8.294e-06 4.535 5.79e-06 ***
## poly(annual_inc, 3)1 -1.589e-02 1.117e-02 -1.423 0.1549
## poly(annual_inc, 3)2 6.296e-03 1.086e-02 0.580 0.5621
## poly(annual_inc, 3)3 -8.981e-03 1.084e-02 -0.828 0.4076
## loan_amnt:gradeB -6.633e-08 2.442e-08 -2.717 0.0066 **
## loan_amnt:gradeC -1.703e-07 2.621e-08 -6.498 8.23e-11 ***
## loan_amnt:gradeD 6.709e-08 2.798e-08 2.398 0.0165 *
## loan_amnt:gradeE 2.209e-07 3.016e-08 7.324 2.45e-13 ***
## loan_amnt:gradeF 2.780e-07 4.136e-08 6.722 1.82e-11 ***
## loan_amnt:gradeG 1.270e-07 7.260e-08 1.750 0.0801 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01052 on 37850 degrees of freedom
## Multiple R-squared: 0.9204, Adjusted R-squared: 0.9204
## F-statistic: 2.431e+04 on 18 and 37850 DF, p-value: < 2.2e-16
# Alternative treatment of income: instead of a continuous variable, bucket
# annual_inc into quartiles and enter the quartile as a factor (dummy variables).
lc_clean <- mutate(
  lc_clean,
  quartiles_annual_inc = as.factor(ntile(annual_inc, 4))
)

model4 <- lm(
  int_rate ~ loan_amnt * grade + term + dti + quartiles_annual_inc,
  data = lc_clean
)
summary(model4)
##
## Call:
## lm(formula = int_rate ~ loan_amnt * grade + term + dti + quartiles_annual_inc,
## data = lc_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.119767 -0.007203 -0.000064 0.006596 0.037581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.187e-02 2.414e-04 297.700 < 2e-16 ***
## loan_amnt 1.615e-07 2.048e-08 7.889 3.13e-15 ***
## gradeB 3.622e-02 2.733e-04 132.524 < 2e-16 ***
## gradeC 6.196e-02 2.989e-04 207.272 < 2e-16 ***
## gradeD 8.079e-02 3.484e-04 231.877 < 2e-16 ***
## gradeE 9.632e-02 4.625e-04 208.273 < 2e-16 ***
## gradeF 1.143e-01 7.735e-04 147.723 < 2e-16 ***
## gradeG 1.327e-01 1.551e-03 85.570 < 2e-16 ***
## term60 3.781e-03 1.425e-04 26.532 < 2e-16 ***
## dti 3.672e-05 8.259e-06 4.446 8.78e-06 ***
## quartiles_annual_inc2 -2.619e-04 1.548e-04 -1.692 0.09068 .
## quartiles_annual_inc3 -3.992e-04 1.590e-04 -2.510 0.01208 *
## quartiles_annual_inc4 -5.382e-04 1.692e-04 -3.181 0.00147 **
## loan_amnt:gradeB -6.638e-08 2.441e-08 -2.719 0.00655 **
## loan_amnt:gradeC -1.699e-07 2.621e-08 -6.483 9.10e-11 ***
## loan_amnt:gradeD 6.770e-08 2.798e-08 2.419 0.01555 *
## loan_amnt:gradeE 2.203e-07 3.016e-08 7.304 2.85e-13 ***
## loan_amnt:gradeF 2.766e-07 4.136e-08 6.687 2.30e-11 ***
## loan_amnt:gradeG 1.267e-07 7.259e-08 1.746 0.08089 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01052 on 37850 degrees of freedom
## Multiple R-squared: 0.9204, Adjusted R-squared: 0.9204
## F-statistic: 2.432e+04 on 18 and 37850 DF, p-value: < 2.2e-16
# Compare the performance of these four models using the anova command.
# All four fitted models are passed in one call, in the order they were built.
anova(model1, model2, model3, model4) # compare all four models in one table
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.79e+04 | 4.22 | ||||
3.79e+04 | 4.19 | 6 | 0.0332 | 50 | 1.64e-61 |
3.78e+04 | 4.19 | 2 | 0.000107 | 0.484 | 0.616 |
3.78e+04 | 4.19 | 0 | 0.000915 |
# Pairwise comparison: model2 is model1 plus the loan_amnt:grade interaction.
anova(model1, model2) # compare each pair of models individually
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.79e+04 | 4.22 | ||||
3.79e+04 | 4.19 | 6 | 0.0332 | 50 | 1.64e-61 |
# model3 replaces the linear annual_inc term of model2 with poly(annual_inc, 3).
anova(model2, model3)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.79e+04 | 4.19 | ||||
3.78e+04 | 4.19 | 2 | 0.000107 | 0.484 | 0.616 |
# NOTE(review): model3 (cubic polynomial in income) and model4 (income quartile
# dummies) are not nested and use the same number of income terms, so this
# comparison cannot yield an F-test; read it as a difference in RSS only.
anova(model3, model4)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.78e+04 | 4.19 | ||||
3.78e+04 | 4.19 | 0 | 0.000915 |
# model2 uses linear annual_inc, model4 uses income quartile dummies.
# NOTE(review): these two models are not nested either, so treat the F-test
# reported for this pair with caution.
anova(model2,model4)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.79e+04 | 4.19 | ||||
3.78e+04 | 4.19 | 2 | 0.00102 | 4.62 | 0.00983 |
# When several models are supplied, anova() tests each model against the one listed immediately before it (a sequential comparison), and marks whether that difference is significant.
Let us Consider the Following questions:
Which of the four models has the most explanatory power in sample?
In model 2, how should the estimated coefficient of the interaction term between grade B and loan amount, be interpreted?
The problem of multicollinearity describes situations in which one feature is correlated with other features (or with a linear combination of other features). If your goal is to use the model to make predictions, should you be concerned about multicollinearity? Why, or why not?
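Model 4 has (marginally) the most explanatory power in sample: models 2, 3 and 4 all report an adjusted \(R^2\) of 0.9204 (versus 0.9197 for model 1), and the anova tables show that model 4 has the lowest residual sum of squares of the four.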
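The interaction coefficient \(-6.617*10^{-8}\) is the difference between the loan-amount slope for grade B loans and the slope for the baseline grade A. For a loan of grade B, each unit increase in loan amount therefore changes the interest rate by \(1.528*10^{-7}-6.617*10^{-8}\approx 8.66*10^{-8}\): the rate still rises with the loan amount, but more slowly than for a grade A loan.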
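Multicollinearity is not a major concern when the goal is prediction. It inflates the standard errors of the individual coefficients and makes them harder to interpret, so it influences the explanatory value of the model, but it does not harm the accuracy of the predictions as long as new data show the same correlation pattern among the features.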
Out of sample testing
Let’s check the predictive accuracy of model2 by holding out a subset of the data to use as a testing data set. This method is sometimes referred to as the hold-out method for out-of-sample testing.
k-fold cross validation
We can also do out of sample testing using the method of k-fold cross validation. Using the caret package this is easy.
# The method "cv" stands for cross-validation. We are going to create 10 folds.
control <- trainControl(
  method = "cv",
  number = 10,
  verboseIter = FALSE # set to TRUE to report progress after each estimation
)

# Train the linear model and assess it with k-fold cross-validation.
plsFit <- train(
  int_rate ~ loan_amnt + term + dti + annual_inc + grade + grade:loan_amnt,
  data = lc_clean,
  method = "lm",
  trControl = control
)

# summary() reports the final model refitted on all of lc_clean; the
# cross-validated metrics are held in plsFit$results.
summary(plsFit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.119807 -0.007230 -0.000057 0.006588 0.037460
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.171e-02 2.345e-04 305.762 < 2e-16 ***
## loan_amnt 1.528e-07 2.028e-08 7.537 4.91e-14 ***
## term60 3.793e-03 1.424e-04 26.636 < 2e-16 ***
## dti 3.836e-05 8.250e-06 4.649 3.34e-06 ***
## annual_inc -1.224e-09 9.249e-10 -1.324 0.18564
## gradeB 3.623e-02 2.732e-04 132.619 < 2e-16 ***
## gradeC 6.198e-02 2.988e-04 207.452 < 2e-16 ***
## gradeD 8.082e-02 3.483e-04 232.034 < 2e-16 ***
## gradeE 9.633e-02 4.625e-04 208.293 < 2e-16 ***
## gradeF 1.143e-01 7.735e-04 147.699 < 2e-16 ***
## gradeG 1.327e-01 1.551e-03 85.580 < 2e-16 ***
## `loan_amnt:gradeB` -6.617e-08 2.441e-08 -2.710 0.00673 **
## `loan_amnt:gradeC` -1.704e-07 2.621e-08 -6.500 8.14e-11 ***
## `loan_amnt:gradeD` 6.703e-08 2.798e-08 2.395 0.01662 *
## `loan_amnt:gradeE` 2.209e-07 3.016e-08 7.323 2.47e-13 ***
## `loan_amnt:gradeF` 2.779e-07 4.136e-08 6.720 1.85e-11 ***
## `loan_amnt:gradeG` 1.265e-07 7.260e-08 1.743 0.08140 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01052 on 37852 degrees of freedom
## Multiple R-squared: 0.9204, Adjusted R-squared: 0.9204
## F-statistic: 2.735e+04 on 16 and 37852 DF, p-value: < 2.2e-16
# Same cross-validation set-up as before, this time with 5 folds.
control <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = FALSE # set to TRUE to report progress after each estimation
)

# Train the linear model and assess it with k-fold cross-validation.
plsFit <- train(
  int_rate ~ loan_amnt + term + dti + annual_inc + grade + grade:loan_amnt,
  data = lc_clean,
  method = "lm",
  trControl = control
)

# summary() reports the final model refitted on all of lc_clean; the
# cross-validated metrics are held in plsFit$results.
summary(plsFit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.119807 -0.007230 -0.000057 0.006588 0.037460
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.171e-02 2.345e-04 305.762 < 2e-16 ***
## loan_amnt 1.528e-07 2.028e-08 7.537 4.91e-14 ***
## term60 3.793e-03 1.424e-04 26.636 < 2e-16 ***
## dti 3.836e-05 8.250e-06 4.649 3.34e-06 ***
## annual_inc -1.224e-09 9.249e-10 -1.324 0.18564
## gradeB 3.623e-02 2.732e-04 132.619 < 2e-16 ***
## gradeC 6.198e-02 2.988e-04 207.452 < 2e-16 ***
## gradeD 8.082e-02 3.483e-04 232.034 < 2e-16 ***
## gradeE 9.633e-02 4.625e-04 208.293 < 2e-16 ***
## gradeF 1.143e-01 7.735e-04 147.699 < 2e-16 ***
## gradeG 1.327e-01 1.551e-03 85.580 < 2e-16 ***
## `loan_amnt:gradeB` -6.617e-08 2.441e-08 -2.710 0.00673 **
## `loan_amnt:gradeC` -1.704e-07 2.621e-08 -6.500 8.14e-11 ***
## `loan_amnt:gradeD` 6.703e-08 2.798e-08 2.395 0.01662 *
## `loan_amnt:gradeE` 2.209e-07 3.016e-08 7.323 2.47e-13 ***
## `loan_amnt:gradeF` 2.779e-07 4.136e-08 6.720 1.85e-11 ***
## `loan_amnt:gradeG` 1.265e-07 7.260e-08 1.743 0.08140 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01052 on 37852 degrees of freedom
## Multiple R-squared: 0.9204, Adjusted R-squared: 0.9204
## F-statistic: 2.735e+04 on 16 and 37852 DF, p-value: < 2.2e-16
# Same cross-validation set-up as before, this time with 15 folds.
control <- trainControl(
  method = "cv",
  number = 15,
  verboseIter = FALSE # set to TRUE to report progress after each estimation
)

# Train the linear model and assess it with k-fold cross-validation.
plsFit <- train(
  int_rate ~ loan_amnt + term + dti + annual_inc + grade + grade:loan_amnt,
  data = lc_clean,
  method = "lm",
  trControl = control
)

# summary() reports the final model refitted on all of lc_clean; the
# cross-validated metrics are held in plsFit$results.
summary(plsFit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.119807 -0.007230 -0.000057 0.006588 0.037460
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.171e-02 2.345e-04 305.762 < 2e-16 ***
## loan_amnt 1.528e-07 2.028e-08 7.537 4.91e-14 ***
## term60 3.793e-03 1.424e-04 26.636 < 2e-16 ***
## dti 3.836e-05 8.250e-06 4.649 3.34e-06 ***
## annual_inc -1.224e-09 9.249e-10 -1.324 0.18564
## gradeB 3.623e-02 2.732e-04 132.619 < 2e-16 ***
## gradeC 6.198e-02 2.988e-04 207.452 < 2e-16 ***
## gradeD 8.082e-02 3.483e-04 232.034 < 2e-16 ***
## gradeE 9.633e-02 4.625e-04 208.293 < 2e-16 ***
## gradeF 1.143e-01 7.735e-04 147.699 < 2e-16 ***
## gradeG 1.327e-01 1.551e-03 85.580 < 2e-16 ***
## `loan_amnt:gradeB` -6.617e-08 2.441e-08 -2.710 0.00673 **
## `loan_amnt:gradeC` -1.704e-07 2.621e-08 -6.500 8.14e-11 ***
## `loan_amnt:gradeD` 6.703e-08 2.798e-08 2.395 0.01662 *
## `loan_amnt:gradeE` 2.209e-07 3.016e-08 7.323 2.47e-13 ***
## `loan_amnt:gradeF` 2.779e-07 4.136e-08 6.720 1.85e-11 ***
## `loan_amnt:gradeG` 1.265e-07 7.260e-08 1.743 0.08140 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01052 on 37852 degrees of freedom
## Multiple R-squared: 0.9204, Adjusted R-squared: 0.9204
## F-statistic: 2.735e+04 on 16 and 37852 DF, p-value: < 2.2e-16
Cross-validation is likely to give a more robust estimate of the model's out-of-sample performance, but in this case the RMSE doesn’t seem to be very different from the one obtained with simple out-of-sample (hold-out) testing. k-fold cross-validation performs the out-of-sample test k times on different folds of the dataset, so it is computationally more intensive.
Sample size estimation and learning curves
We can use the hold out method for out-of-sample testing to check if we have a sufficiently large sample to estimate the model reliably. The idea is to set aside some of the data as a testing set. From the remaining data draw progressively larger training sets and check how the performance of the model on the testing set changes. If the performance no longer improves with larger training sets we know we have a large enough sample. The code below does this. Examine it and run it with different random seeds.
# Select a testing dataset (25% of all data); the other 75% is the pool from
# which the training sets are drawn.
set.seed(12)
train_test_split <- initial_split(lc_clean, prop = 0.75)
remaining <- training(train_test_split)
testing <- testing(train_test_split)

# Fit 30 models, starting from a tiny training set drawn from `remaining` and
# progressively increasing its size (i/200 of `remaining` at step i). The
# testing set stays the same in all iterations.
n_steps <- 30

# Preallocate one slot per step instead of growing the vectors inside the loop.
rmse_sample <- numeric(n_steps)
sample_size <- numeric(n_steps)
Rsq_sample <- numeric(n_steps)

for (i in seq_len(n_steps)) {
  # Reset the seed so the split drawn at each step is reproducible.
  set.seed(100)
  learning_split <- initial_split(remaining, prop = i / 200)
  training <- training(learning_split)
  sample_size[i] <- nrow(training)

  # Train the model on the small training set.
  # NOTE(review): this reuses the name `model3` and so overwrites the
  # polynomial model fitted earlier; rename it if that model is needed later.
  model3 <- lm(
    int_rate ~ loan_amnt + term + dti + annual_inc + grade + grade:loan_amnt,
    data = training
  )

  # Score the model on the fixed testing set.
  pred <- predict(model3, testing)
  rmse_sample[i] <- RMSE(pred, testing$int_rate)
  Rsq_sample[i] <- R2(pred, testing$int_rate)
}
## Warning in predict.lm(model3, testing): prediction from a rank-deficient fit may
## be misleading
## Warning in predict.lm(model3, testing): prediction from a rank-deficient fit may
## be misleading
# Learning curves: test-set RMSE and R-squared against training-set size
plot(x = sample_size, y = rmse_sample)
plot(x = sample_size, y = Rsq_sample)
A sample size of about 2000 is enough; above that there isn’t any significant improvement in the model. To reduce the prediction error further, we could try automated feature selection or other ways to improve the model.
Regularization using LASSO regression
If we are in the region of the learning curve where we do not have enough data, one option is to use a regularization method such as LASSO.
Let’s try to estimate a large and complicated model (many interactions and polynomials) on a small training dataset using OLS regression and hold-out validation method.
# Split the data into training and testing sets. The training set is
# deliberately tiny: only 1% of the rows.
set.seed(1234)
train_test_split <- initial_split(lc_clean, prop = 0.01)
training <- training(train_test_split)
testing <- testing(train_test_split)

# Large OLS specification: cubic polynomial in loan amount plus its
# interactions with term and grade.
model_lm <- lm(
  int_rate ~ poly(loan_amnt, 3) + term + dti + annual_inc + grade +
    grade:poly(loan_amnt, 3):term + poly(loan_amnt, 3):term + grade:term,
  data = training
)
predictions <- predict(model_lm, testing)
## Warning in predict.lm(model_lm, testing): prediction from a rank-deficient fit
## may be misleading
# Out-of-sample prediction performance of the OLS model
data.frame(
  RMSE = RMSE(predictions, testing[["int_rate"]]),
  Rsquare = R2(predictions, testing[["int_rate"]])
)
RMSE | Rsquare |
---|---|
0.0123 | 0.891 |
Not surprisingly this model does not perform well – as we knew from the learning curves we constructed for a simpler model, we need a lot more data to estimate this model reliably. Try running it again with different seeds. The model’s performance tends to be sensitive to the choice of the training set.
LASSO regression offers one solution – it extends the OLS regression by penalizing the model for setting any coefficient estimate to a value that is different from zero. The penalty is proportional to a parameter \(\lambda\). This parameter cannot be estimated directly (and for this reason sometimes it is referred to as hyperparameter). \(\lambda\) will be selected through k-fold cross validation so as to provide the best out-of-sample performance. As a result of the LASSO procedure, only those features that are more strongly associated with the outcome will have non-zero coefficient estimates and the estimated model will be less sensitive to the training set. Sometimes LASSO regression is referred to as regularization.
# Grid of candidate penalties: 1000 evenly spaced lambdas between 0 and 0.01
# (feel free to try more if necessary).
# The seed is set before train() so the resampling it performs is reproducible.
set.seed(1234)
lambda_seq <- seq(0, 0.01, length.out = 1000)

# LASSO regression; the resampling scheme in `control` is used to pick the
# best lambda from the grid.
lasso <- train(
  int_rate ~ poly(loan_amnt, 3) + term + dti + annual_inc + grade +
    grade:poly(loan_amnt, 3):term + poly(loan_amnt, 3):term + grade:term,
  data = training,
  method = "glmnet",
  # Standardize the predictors before running the LASSO regression.
  preProcess = c("center", "scale"),
  trControl = control,
  # alpha = 1 specifies a LASSO regression; alpha = 0 would run ridge regression.
  tuneGrid = expand.grid(alpha = 1, lambda = lambda_seq)
)
# Coefficients of the final glmnet fit, evaluated at the selected lambda
coef(lasso$finalModel, s = lasso$bestTune$lambda)
## 58 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.211648e-01
## poly(loan_amnt, 3)1 8.230186e-04
## poly(loan_amnt, 3)2 .
## poly(loan_amnt, 3)3 3.424361e-04
## term60 1.296476e-03
## dti 4.665829e-04
## annual_inc .
## gradeB 1.704497e-02
## gradeC 2.347740e-02
## gradeD 2.503032e-02
## gradeE 2.499361e-02
## gradeF 1.904264e-02
## gradeG 1.746929e-02
## poly(loan_amnt, 3)1:term60 .
## poly(loan_amnt, 3)2:term60 .
## poly(loan_amnt, 3)3:term60 .
## term60:gradeB -8.891932e-04
## term60:gradeC .
## term60:gradeD 5.765682e-04
## term60:gradeE 1.751918e-03
## term60:gradeF 6.890687e-04
## term60:gradeG 1.664257e-03
## poly(loan_amnt, 3)1:term36:gradeB .
## poly(loan_amnt, 3)2:term36:gradeB .
## poly(loan_amnt, 3)3:term36:gradeB 4.304657e-04
## poly(loan_amnt, 3)1:term60:gradeB .
## poly(loan_amnt, 3)2:term60:gradeB 4.984815e-04
## poly(loan_amnt, 3)3:term60:gradeB .
## poly(loan_amnt, 3)1:term36:gradeC .
## poly(loan_amnt, 3)2:term36:gradeC 3.714556e-04
## poly(loan_amnt, 3)3:term36:gradeC -3.335581e-04
## poly(loan_amnt, 3)1:term60:gradeC .
## poly(loan_amnt, 3)2:term60:gradeC .
## poly(loan_amnt, 3)3:term60:gradeC -4.161594e-04
## poly(loan_amnt, 3)1:term36:gradeD .
## poly(loan_amnt, 3)2:term36:gradeD 5.934940e-04
## poly(loan_amnt, 3)3:term36:gradeD 2.833198e-04
## poly(loan_amnt, 3)1:term60:gradeD .
## poly(loan_amnt, 3)2:term60:gradeD .
## poly(loan_amnt, 3)3:term60:gradeD -9.534448e-05
## poly(loan_amnt, 3)1:term36:gradeE -3.410535e-06
## poly(loan_amnt, 3)2:term36:gradeE .
## poly(loan_amnt, 3)3:term36:gradeE .
## poly(loan_amnt, 3)1:term60:gradeE .
## poly(loan_amnt, 3)2:term60:gradeE -2.266892e-06
## poly(loan_amnt, 3)3:term60:gradeE -5.207649e-04
## poly(loan_amnt, 3)1:term36:gradeF .
## poly(loan_amnt, 3)2:term36:gradeF .
## poly(loan_amnt, 3)3:term36:gradeF .
## poly(loan_amnt, 3)1:term60:gradeF -7.541316e-06
## poly(loan_amnt, 3)2:term60:gradeF .
## poly(loan_amnt, 3)3:term60:gradeF 1.395437e-03
## poly(loan_amnt, 3)1:term36:gradeG .
## poly(loan_amnt, 3)2:term36:gradeG .
## poly(loan_amnt, 3)3:term36:gradeG .
## poly(loan_amnt, 3)1:term60:gradeG .
## poly(loan_amnt, 3)2:term60:gradeG 4.918950e-04
## poly(loan_amnt, 3)3:term60:gradeG .
# Lambda selected by the tuning procedure
lasso$bestTune[["lambda"]]
## [1] 0.0002902903
# Number of coefficients (including the intercept row) that are non-zero,
# followed by the number that were shrunk to exactly zero
sum(coef(lasso$finalModel, s = lasso$bestTune$lambda) != 0)
## [1] 30
sum(coef(lasso$finalModel, s = lasso$bestTune$lambda) == 0)
## [1] 28
# Predict the held-out testing set with the tuned LASSO model
predictions <- predict(lasso, testing)
# Out-of-sample prediction performance of the LASSO model
data.frame(
  RMSE = RMSE(predictions, testing[["int_rate"]]),
  Rsquare = R2(predictions, testing[["int_rate"]])
)
RMSE | Rsquare |
---|---|
0.0108 | 0.917 |
Let us consider the following questions:
Which model performs best out of sample, OLS regression or LASSO? Why?
What value of lambda offers best performance? Is this sensitive to the random seed? Why?
How many coefficients are zero and how many are non-zero in the LASSO model of best fit? Is the number of zero (or non-zero) coefficients sensitive to the random seed? Why?
Why is it important to standardize continuous variables before running LASSO?
LASSO performs better out of sample (RMSE 0.0108 vs 0.0123, R-squared 0.917 vs 0.891) because it avoids overfitting the data better than OLS does.
\(\lambda = 0.0002902903\) performs best in the run shown above. This value is sensitive to the random seed because the k-fold validation splits change with the randomness, which in turn influences the models to which each candidate \(\lambda\) is applied and therefore which \(\lambda\) is selected.
28 coefficients are equal to zero and 30 coefficients are non-zero. These counts are influenced by the random seed as well, because the selected \(\lambda\) changes with the random seed, which changes the penalty on each coefficient.
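Standardizing brings all variables to the same scale, as they might be measured in different units. This matters because the LASSO penalty is applied to the size of each coefficient, and the size of a coefficient depends on the units of its variable; without standardization, variables would be penalized unequally simply because of the scale they are measured on.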
Using Time Information
Let’s try to further improve the model’s predictive performance. So far we have not used any time series information. Effectively, all things being equal, our prediction for the interest rate of a loan given in 2009 would be the same as that of a loan given in 2011. Is this a good assumption?
First, investigate graphically whether there are any time trends in the interest rates. (Note that the variable “issue_d” only has information on the month the loan was awarded but not the exact date.) Can you use this information to further improve the forecasting accuracy of your model? Try controlling for time in a linear fashion (i.e., a linear time trend) and controlling for time as quarter-year dummies (this is a method to capture non-linear effects of time – we assume that the impact of time doesn’t change within a quarter but it can change from quarter to quarter). Finally, check if time affects loans of different grades differently.
# Time trend: interest rate against issue month, with a smoothed trend line
ggplot(data = lc_clean, mapping = aes(x = issue_d, y = int_rate)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Interest Rates over time",
    x = "date",
    y = "interest rate"
  ) +
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Time trend by grade: the same plot, with one panel per loan grade
ggplot(data = lc_clean, mapping = aes(x = issue_d, y = int_rate)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~grade) +
  labs(
    title = "Interest Rates over time by Grade",
    x = "date",
    y = "interest rate"
  ) +
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Train models using OLS regression and k-fold cross-validation.
# Model 1: the explanatory variables used so far plus a single linear time
# trend (issue_d) shared by all grades.
time1 <- train(
  int_rate ~ loan_amnt + term + dti + annual_inc + grade + grade:loan_amnt +
    issue_d,
  data = lc_clean,
  method = "lm",
  trControl = control
)
summary(time1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.119034 -0.006588 -0.000743 0.006949 0.036181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.504e-02 2.512e-03 -9.966 < 2e-16 ***
## loan_amnt 9.114e-08 1.995e-08 4.568 4.95e-06 ***
## term60 2.254e-03 1.452e-04 15.520 < 2e-16 ***
## dti 1.052e-05 8.123e-06 1.295 0.1953
## annual_inc -1.169e-09 9.072e-10 -1.289 0.1976
## gradeB 3.609e-02 2.680e-04 134.663 < 2e-16 ***
## gradeC 6.246e-02 2.933e-04 212.954 < 2e-16 ***
## gradeD 8.130e-02 3.418e-04 237.833 < 2e-16 ***
## gradeE 9.728e-02 4.543e-04 214.150 < 2e-16 ***
## gradeF 1.152e-01 7.591e-04 151.799 < 2e-16 ***
## gradeG 1.337e-01 1.521e-03 87.864 < 2e-16 ***
## issue_d 6.538e-06 1.691e-07 38.673 < 2e-16 ***
## `loan_amnt:gradeB` 8.745e-11 2.401e-08 0.004 0.9971
## `loan_amnt:gradeC` -1.205e-07 2.574e-08 -4.681 2.86e-06 ***
## `loan_amnt:gradeD` 1.143e-07 2.747e-08 4.160 3.19e-05 ***
## `loan_amnt:gradeE` 2.464e-07 2.959e-08 8.328 < 2e-16 ***
## `loan_amnt:gradeF` 3.018e-07 4.057e-08 7.438 1.04e-13 ***
## `loan_amnt:gradeG` 1.594e-07 7.121e-08 2.239 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01031 on 37851 degrees of freedom
## Multiple R-squared: 0.9234, Adjusted R-squared: 0.9234
## F-statistic: 2.685e+04 on 17 and 37851 DF, p-value: < 2.2e-16
# Model 2: a separate linear time trend for each grade class
# (issue_d interacted with grade).
time2 <- train(
  int_rate ~ loan_amnt + term + dti + annual_inc + grade:loan_amnt +
    issue_d * grade,
  data = lc_clean,
  method = "lm",
  trControl = control
)
summary(time2)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.120471 -0.006615 -0.000147 0.006766 0.030872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.860e-01 4.357e-03 65.639 < 2e-16 ***
## loan_amnt 3.063e-07 1.769e-08 17.314 < 2e-16 ***
## term60 -6.187e-04 1.303e-04 -4.749 2.05e-06 ***
## dti 5.591e-05 7.140e-06 7.830 5.00e-15 ***
## annual_inc -1.292e-09 7.960e-10 -1.623 0.105
## issue_d -1.443e-05 2.935e-07 -49.173 < 2e-16 ***
## gradeB -2.297e-01 5.808e-03 -39.557 < 2e-16 ***
## gradeC -3.539e-01 6.179e-03 -57.279 < 2e-16 ***
## gradeD -5.145e-01 7.198e-03 -71.482 < 2e-16 ***
## gradeE -6.134e-01 9.905e-03 -61.931 < 2e-16 ***
## gradeF -7.103e-01 1.612e-02 -44.068 < 2e-16 ***
## gradeG -7.108e-01 3.314e-02 -21.447 < 2e-16 ***
## `loan_amnt:gradeB` -1.557e-07 2.123e-08 -7.337 2.23e-13 ***
## `loan_amnt:gradeC` -3.235e-07 2.283e-08 -14.169 < 2e-16 ***
## `loan_amnt:gradeD` -1.778e-07 2.435e-08 -7.302 2.90e-13 ***
## `loan_amnt:gradeE` -1.719e-07 2.659e-08 -6.467 1.02e-10 ***
## `loan_amnt:gradeF` -1.854e-07 3.678e-08 -5.043 4.61e-07 ***
## `loan_amnt:gradeG` -2.950e-07 6.440e-08 -4.581 4.65e-06 ***
## `gradeB:issue_d` 1.790e-05 3.903e-07 45.864 < 2e-16 ***
## `gradeC:issue_d` 2.806e-05 4.165e-07 67.370 < 2e-16 ***
## `gradeD:issue_d` 4.018e-05 4.850e-07 82.858 < 2e-16 ***
## `gradeE:issue_d` 4.799e-05 6.680e-07 71.848 < 2e-16 ***
## `gradeF:issue_d` 5.571e-05 1.085e-06 51.333 < 2e-16 ***
## `gradeG:issue_d` 5.699e-05 2.230e-06 25.556 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.009049 on 37845 degrees of freedom
## Multiple R-squared: 0.9411, Adjusted R-squared: 0.941
## F-statistic: 2.628e+04 on 23 and 37845 DF, p-value: < 2.2e-16
# Replace the linear time trend with quarter dummy variables.
# zoo::as.yearqtr() maps each issue date to its year-quarter; wrapping it in
# as.factor() makes the model estimate one dummy per quarter.
# The column is referenced directly (not as lc_clean$issue_d) so the
# expression always operates on the rows flowing through the pipeline.
lc_clean_quarter <- lc_clean %>%
  mutate(yq = as.factor(as.yearqtr(issue_d, format = "%Y-%m-%d")))

# Model 3: quarter dummies shared by all grades.
time3 <- train(
  int_rate ~ loan_amnt + term + dti + annual_inc + grade * loan_amnt + yq,
  lc_clean_quarter,
  method = "lm",
  trControl = control
)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
summary(time3)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.115929 -0.005410 -0.000245 0.005796 0.037593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.827e-02 9.076e-03 4.217 2.48e-05 ***
## loan_amnt 1.040e-07 1.760e-08 5.908 3.48e-09 ***
## term60 4.582e-03 1.309e-04 34.990 < 2e-16 ***
## dti 6.068e-06 7.147e-06 0.849 0.395841
## annual_inc 1.346e-10 7.983e-10 0.169 0.866063
## gradeB 3.659e-02 2.359e-04 155.122 < 2e-16 ***
## gradeC 6.250e-02 2.583e-04 241.949 < 2e-16 ***
## gradeD 8.098e-02 3.010e-04 269.059 < 2e-16 ***
## gradeE 9.700e-02 3.999e-04 242.524 < 2e-16 ***
## gradeF 1.154e-01 6.679e-04 172.731 < 2e-16 ***
## gradeG 1.338e-01 1.338e-03 99.996 < 2e-16 ***
## `yq2007 Q3` 1.911e-02 9.136e-03 2.091 0.036506 *
## `yq2007 Q4` 1.469e-02 9.103e-03 1.614 0.106601
## `yq2008 Q1` 2.114e-02 9.082e-03 2.327 0.019956 *
## `yq2008 Q2` 2.275e-02 9.090e-03 2.502 0.012339 *
## `yq2008 Q3` 2.390e-02 9.100e-03 2.627 0.008623 **
## `yq2008 Q4` 3.182e-02 9.083e-03 3.504 0.000459 ***
## `yq2009 Q1` 3.601e-02 9.080e-03 3.966 7.32e-05 ***
## `yq2009 Q2` 3.786e-02 9.078e-03 4.170 3.05e-05 ***
## `yq2009 Q3` 3.939e-02 9.077e-03 4.339 1.43e-05 ***
## `yq2009 Q4` 3.964e-02 9.076e-03 4.368 1.26e-05 ***
## `yq2010 Q1` 3.462e-02 9.076e-03 3.815 0.000136 ***
## `yq2010 Q2` 3.337e-02 9.075e-03 3.677 0.000236 ***
## `yq2010 Q3` 3.446e-02 9.075e-03 3.798 0.000146 ***
## `yq2010 Q4` 2.574e-02 9.075e-03 2.837 0.004562 **
## `yq2011 Q1` 2.787e-02 9.075e-03 3.071 0.002133 **
## `yq2011 Q2` 3.304e-02 9.075e-03 3.641 0.000272 ***
## `yq2011 Q3` 3.543e-02 9.075e-03 3.905 9.45e-05 ***
## `yq2011 Q4` 4.167e-02 9.074e-03 4.592 4.40e-06 ***
## `loan_amnt:gradeB` -1.079e-07 2.117e-08 -5.099 3.44e-07 ***
## `loan_amnt:gradeC` -2.145e-07 2.271e-08 -9.446 < 2e-16 ***
## `loan_amnt:gradeD` 5.835e-08 2.422e-08 2.409 0.016004 *
## `loan_amnt:gradeE` 1.816e-07 2.609e-08 6.960 3.46e-12 ***
## `loan_amnt:gradeF` 2.265e-07 3.573e-08 6.341 2.31e-10 ***
## `loan_amnt:gradeG` 1.123e-07 6.265e-08 1.793 0.073043 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00907 on 37834 degrees of freedom
## Multiple R-squared: 0.9408, Adjusted R-squared: 0.9408
## F-statistic: 1.769e+04 on 34 and 37834 DF, p-value: < 2.2e-16
# Model 4: a separate set of quarter dummies for each grade (yq interacted
# with grade). This is a large model: 19 quarters x 7 grades = 133
# quarter-grade dummies.
time4 <- train(
  int_rate ~ loan_amnt + term + dti + annual_inc + grade * loan_amnt +
    yq * grade,
  data = lc_clean_quarter,
  method = "lm",
  trControl = control
)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Print the coefficient table and fit statistics of the time4 model.
summary(time4)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.120433 -0.004928 0.000306 0.004675 0.034227
##
## Coefficients: (11 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.860e-02 7.579e-03 2.454 0.014115 *
## loan_amnt 2.973e-07 1.496e-08 19.878 < 2e-16 ***
## term60 1.728e-03 1.130e-04 15.300 < 2e-16 ***
## dti 4.826e-05 5.984e-06 8.065 7.53e-16 ***
## annual_inc -3.047e-11 6.666e-10 -0.046 0.963547
## gradeB 4.307e-02 3.192e-04 134.925 < 2e-16 ***
## gradeC 6.975e-02 3.622e-04 192.580 < 2e-16 ***
## gradeD 9.703e-02 4.189e-04 231.655 < 2e-16 ***
## gradeE 1.177e-01 5.380e-04 218.707 < 2e-16 ***
## gradeF 1.390e-01 8.638e-04 160.903 < 2e-16 ***
## gradeG 1.603e-01 1.800e-03 89.024 < 2e-16 ***
## `yq2007 Q3` 5.476e-02 7.722e-03 7.092 1.35e-12 ***
## `yq2007 Q4` 5.579e-02 7.711e-03 7.235 4.74e-13 ***
## `yq2008 Q1` 5.946e-02 7.617e-03 7.807 6.01e-15 ***
## `yq2008 Q2` 6.088e-02 7.651e-03 7.957 1.81e-15 ***
## `yq2008 Q3` 6.027e-02 7.688e-03 7.839 4.65e-15 ***
## `yq2008 Q4` 6.664e-02 7.616e-03 8.749 < 2e-16 ***
## `yq2009 Q1` 6.834e-02 7.599e-03 8.994 < 2e-16 ***
## `yq2009 Q2` 6.780e-02 7.593e-03 8.930 < 2e-16 ***
## `yq2009 Q3` 6.597e-02 7.589e-03 8.693 < 2e-16 ***
## `yq2009 Q4` 6.397e-02 7.587e-03 8.432 < 2e-16 ***
## `yq2010 Q1` 5.696e-02 7.586e-03 7.509 6.10e-14 ***
## `yq2010 Q2` 5.488e-02 7.584e-03 7.237 4.67e-13 ***
## `yq2010 Q3` 5.485e-02 7.583e-03 7.233 4.82e-13 ***
## `yq2010 Q4` 4.379e-02 7.581e-03 5.776 7.70e-09 ***
## `yq2011 Q1` 4.567e-02 7.581e-03 6.025 1.71e-09 ***
## `yq2011 Q2` 4.656e-02 7.580e-03 6.142 8.25e-10 ***
## `yq2011 Q3` 4.674e-02 7.579e-03 6.167 7.05e-10 ***
## `yq2011 Q4` 5.374e-02 7.575e-03 7.094 1.32e-12 ***
## `loan_amnt:gradeB` -2.454e-07 1.794e-08 -13.678 < 2e-16 ***
## `loan_amnt:gradeC` -3.708e-07 1.928e-08 -19.237 < 2e-16 ***
## `loan_amnt:gradeD` -2.123e-07 2.056e-08 -10.326 < 2e-16 ***
## `loan_amnt:gradeE` -2.249e-07 2.255e-08 -9.973 < 2e-16 ***
## `loan_amnt:gradeF` -2.491e-07 3.154e-08 -7.899 2.89e-15 ***
## `loan_amnt:gradeG` -3.430e-07 5.653e-08 -6.067 1.31e-09 ***
## `gradeB:yq2007 Q3` -2.307e-02 2.299e-03 -10.034 < 2e-16 ***
## `gradeC:yq2007 Q3` -3.555e-02 2.423e-03 -14.669 < 2e-16 ***
## `gradeD:yq2007 Q3` -5.148e-02 3.711e-03 -13.874 < 2e-16 ***
## `gradeE:yq2007 Q3` -5.687e-02 5.566e-03 -10.218 < 2e-16 ***
## `gradeF:yq2007 Q3` -6.133e-02 3.760e-03 -16.313 < 2e-16 ***
## `gradeG:yq2007 Q3` NA NA NA NA
## `gradeB:yq2007 Q4` -2.403e-02 1.948e-03 -12.336 < 2e-16 ***
## `gradeC:yq2007 Q4` -3.565e-02 1.799e-03 -19.820 < 2e-16 ***
## `gradeD:yq2007 Q4` -5.005e-02 2.070e-03 -24.183 < 2e-16 ***
## `gradeE:yq2007 Q4` -5.085e-02 2.723e-03 -18.676 < 2e-16 ***
## `gradeF:yq2007 Q4` -5.441e-02 7.720e-03 -7.048 1.85e-12 ***
## `gradeG:yq2007 Q4` NA NA NA NA
## `gradeB:yq2008 Q1` -2.294e-02 9.838e-04 -23.314 < 2e-16 ***
## `gradeC:yq2008 Q1` -3.340e-02 1.049e-03 -31.837 < 2e-16 ***
## `gradeD:yq2008 Q1` -4.555e-02 1.178e-03 -38.682 < 2e-16 ***
## `gradeE:yq2008 Q1` -5.330e-02 1.655e-03 -32.212 < 2e-16 ***
## `gradeF:yq2008 Q1` -5.639e-02 2.844e-03 -19.826 < 2e-16 ***
## `gradeG:yq2008 Q1` -6.322e-02 5.523e-03 -11.447 < 2e-16 ***
## `gradeB:yq2008 Q2` -2.276e-02 1.363e-03 -16.699 < 2e-16 ***
## `gradeC:yq2008 Q2` -3.375e-02 1.393e-03 -24.237 < 2e-16 ***
## `gradeD:yq2008 Q2` -4.730e-02 1.647e-03 -28.722 < 2e-16 ***
## `gradeE:yq2008 Q2` -5.147e-02 2.910e-03 -17.689 < 2e-16 ***
## `gradeF:yq2008 Q2` -5.828e-02 3.593e-03 -16.222 < 2e-16 ***
## `gradeG:yq2008 Q2` NA NA NA NA
## `gradeB:yq2008 Q3` -2.025e-02 1.740e-03 -11.642 < 2e-16 ***
## `gradeC:yq2008 Q3` -3.116e-02 1.668e-03 -18.674 < 2e-16 ***
## `gradeD:yq2008 Q3` -4.498e-02 2.101e-03 -21.410 < 2e-16 ***
## `gradeE:yq2008 Q3` -4.674e-02 3.008e-03 -15.539 < 2e-16 ***
## `gradeF:yq2008 Q3` -5.667e-02 4.072e-03 -13.918 < 2e-16 ***
## `gradeG:yq2008 Q3` NA NA NA NA
## `gradeB:yq2008 Q4` -1.846e-02 1.018e-03 -18.130 < 2e-16 ***
## `gradeC:yq2008 Q4` -3.049e-02 1.053e-03 -28.959 < 2e-16 ***
## `gradeD:yq2008 Q4` -4.384e-02 1.289e-03 -34.017 < 2e-16 ***
## `gradeE:yq2008 Q4` -4.912e-02 1.676e-03 -29.303 < 2e-16 ***
## `gradeF:yq2008 Q4` -5.311e-02 3.908e-03 -13.590 < 2e-16 ***
## `gradeG:yq2008 Q4` -5.530e-02 5.536e-03 -9.990 < 2e-16 ***
## `gradeB:yq2009 Q1` -1.309e-02 9.322e-04 -14.042 < 2e-16 ***
## `gradeC:yq2009 Q1` -2.475e-02 8.185e-04 -30.242 < 2e-16 ***
## `gradeD:yq2009 Q1` -4.047e-02 9.187e-04 -44.057 < 2e-16 ***
## `gradeE:yq2009 Q1` -4.418e-02 1.249e-03 -35.359 < 2e-16 ***
## `gradeF:yq2009 Q1` -4.824e-02 2.811e-03 -17.163 < 2e-16 ***
## `gradeG:yq2009 Q1` NA NA NA NA
## `gradeB:yq2009 Q2` -1.349e-02 7.244e-04 -18.629 < 2e-16 ***
## `gradeC:yq2009 Q2` -2.423e-02 7.533e-04 -32.163 < 2e-16 ***
## `gradeD:yq2009 Q2` -4.046e-02 9.255e-04 -43.716 < 2e-16 ***
## `gradeE:yq2009 Q2` -4.477e-02 1.311e-03 -34.143 < 2e-16 ***
## `gradeF:yq2009 Q2` -5.317e-02 2.409e-03 -22.068 < 2e-16 ***
## `gradeG:yq2009 Q2` -5.704e-02 4.104e-03 -13.898 < 2e-16 ***
## `gradeB:yq2009 Q3` -1.142e-02 6.336e-04 -18.025 < 2e-16 ***
## `gradeC:yq2009 Q3` -2.040e-02 6.928e-04 -29.441 < 2e-16 ***
## `gradeD:yq2009 Q3` -3.290e-02 8.752e-04 -37.596 < 2e-16 ***
## `gradeE:yq2009 Q3` -3.720e-02 1.131e-03 -32.879 < 2e-16 ***
## `gradeF:yq2009 Q3` -4.274e-02 1.862e-03 -22.954 < 2e-16 ***
## `gradeG:yq2009 Q3` -4.755e-02 3.162e-03 -15.037 < 2e-16 ***
## `gradeB:yq2009 Q4` -8.485e-03 5.660e-04 -14.990 < 2e-16 ***
## `gradeC:yq2009 Q4` -1.674e-02 6.162e-04 -27.159 < 2e-16 ***
## `gradeD:yq2009 Q4` -2.821e-02 7.050e-04 -40.021 < 2e-16 ***
## `gradeE:yq2009 Q4` -3.367e-02 1.108e-03 -30.398 < 2e-16 ***
## `gradeF:yq2009 Q4` -3.638e-02 1.718e-03 -21.182 < 2e-16 ***
## `gradeG:yq2009 Q4` -3.769e-02 2.556e-03 -14.743 < 2e-16 ***
## `gradeB:yq2010 Q1` -1.069e-02 5.403e-04 -19.792 < 2e-16 ***
## `gradeC:yq2010 Q1` -1.152e-02 6.007e-04 -19.181 < 2e-16 ***
## `gradeD:yq2010 Q1` -2.162e-02 6.842e-04 -31.604 < 2e-16 ***
## `gradeE:yq2010 Q1` -2.495e-02 1.015e-03 -24.572 < 2e-16 ***
## `gradeF:yq2010 Q1` -3.069e-02 1.794e-03 -17.106 < 2e-16 ***
## `gradeG:yq2010 Q1` -2.975e-02 3.106e-03 -9.579 < 2e-16 ***
## `gradeB:yq2010 Q2` -9.918e-03 4.801e-04 -20.657 < 2e-16 ***
## `gradeC:yq2010 Q2` -8.566e-03 5.269e-04 -16.258 < 2e-16 ***
## `gradeD:yq2010 Q2` -1.868e-02 5.939e-04 -31.449 < 2e-16 ***
## `gradeE:yq2010 Q2` -2.297e-02 7.790e-04 -29.481 < 2e-16 ***
## `gradeF:yq2010 Q2` -2.840e-02 1.314e-03 -21.614 < 2e-16 ***
## `gradeG:yq2010 Q2` -2.822e-02 2.662e-03 -10.603 < 2e-16 ***
## `gradeB:yq2010 Q3` -7.449e-03 4.609e-04 -16.161 < 2e-16 ***
## `gradeC:yq2010 Q3` -5.011e-03 5.056e-04 -9.911 < 2e-16 ***
## `gradeD:yq2010 Q3` -1.704e-02 5.669e-04 -30.064 < 2e-16 ***
## `gradeE:yq2010 Q3` -2.372e-02 6.796e-04 -34.899 < 2e-16 ***
## `gradeF:yq2010 Q3` -2.737e-02 1.094e-03 -25.010 < 2e-16 ***
## `gradeG:yq2010 Q3` -2.538e-02 1.880e-03 -13.497 < 2e-16 ***
## `gradeB:yq2010 Q4` -7.633e-03 4.269e-04 -17.882 < 2e-16 ***
## `gradeC:yq2010 Q4` -9.645e-04 4.811e-04 -2.005 0.044981 *
## `gradeD:yq2010 Q4` -1.264e-02 5.558e-04 -22.749 < 2e-16 ***
## `gradeE:yq2010 Q4` -1.654e-02 6.724e-04 -24.594 < 2e-16 ***
## `gradeF:yq2010 Q4` -1.944e-02 1.055e-03 -18.425 < 2e-16 ***
## `gradeG:yq2010 Q4` -1.931e-02 1.690e-03 -11.423 < 2e-16 ***
## `gradeB:yq2011 Q1` -5.832e-03 4.113e-04 -14.181 < 2e-16 ***
## `gradeC:yq2011 Q1` -2.389e-03 4.731e-04 -5.050 4.45e-07 ***
## `gradeD:yq2011 Q1` -1.071e-02 5.270e-04 -20.323 < 2e-16 ***
## `gradeE:yq2011 Q1` -1.574e-02 6.063e-04 -25.963 < 2e-16 ***
## `gradeF:yq2011 Q1` -1.816e-02 8.536e-04 -21.279 < 2e-16 ***
## `gradeG:yq2011 Q1` -2.124e-02 1.555e-03 -13.657 < 2e-16 ***
## `gradeB:yq2011 Q2` -1.325e-03 3.914e-04 -3.384 0.000714 ***
## `gradeC:yq2011 Q2` 3.939e-04 4.420e-04 0.891 0.372828
## `gradeD:yq2011 Q2` -4.201e-03 4.945e-04 -8.497 < 2e-16 ***
## `gradeE:yq2011 Q2` -5.544e-03 5.849e-04 -9.479 < 2e-16 ***
## `gradeF:yq2011 Q2` -5.154e-03 8.427e-04 -6.117 9.65e-10 ***
## `gradeG:yq2011 Q2` -1.097e-02 1.592e-03 -6.891 5.64e-12 ***
## `gradeB:yq2011 Q3` 1.314e-03 3.653e-04 3.597 0.000322 ***
## `gradeC:yq2011 Q3` 2.389e-03 4.221e-04 5.659 1.54e-08 ***
## `gradeD:yq2011 Q3` -3.512e-04 4.749e-04 -0.740 0.459599
## `gradeE:yq2011 Q3` 1.606e-03 5.763e-04 2.787 0.005326 **
## `gradeF:yq2011 Q3` 9.302e-04 8.587e-04 1.083 0.278699
## `gradeG:yq2011 Q3` -1.695e-03 1.643e-03 -1.032 0.302205
## `gradeB:yq2011 Q4` NA NA NA NA
## `gradeC:yq2011 Q4` NA NA NA NA
## `gradeD:yq2011 Q4` NA NA NA NA
## `gradeE:yq2011 Q4` NA NA NA NA
## `gradeF:yq2011 Q4` NA NA NA NA
## `gradeG:yq2011 Q4` NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007564 on 37737 degrees of freedom
## Multiple R-squared: 0.9589, Adjusted R-squared: 0.9588
## F-statistic: 6728 on 131 and 37737 DF, p-value: < 2.2e-16
# Put the resampled RMSE of the four time models side by side.
# The arguments are unnamed, so data.frame() derives the column names
# (time1.results.RMSE, ...) from the expressions themselves.
data.frame(
time1$results$RMSE,
time2$results$RMSE,
time3$results$RMSE,
time4$results$RMSE)
time1.results.RMSE | time2.results.RMSE | time3.results.RMSE | time4.results.RMSE |
---|---|---|---|
0.0103 | 0.00905 | 0.00908 | 0.00757 |
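Interest rates change over time, but they change at different rates depending on the grade of the loan. Allowing the time effect to differ by grade therefore improves the prediction at each stage (although the common quarter dummies of time3, RMSE 0.00908, do no better than the grade-specific linear trends of time2, RMSE 0.00905), with the best model (time4, RMSE 0.00757) being produced using year quarters in relation to loan grade.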
Using Bond Yields
One concern with using time trends for forecasting is that in order to make predictions for future loans we will need to project trends to the future. This is an extrapolation that may not be reasonable, especially if macroeconomic conditions in the future change. Furthermore, if we are using quarter-year dummies, it is not even possible to estimate the coefficient of these dummy variables for future quarters.
Instead, perhaps it’s better to find the reasons as to why different periods are different from one another. The csv file “MonthBondYields.csv” contains information on the yield of US Treasuries on the first day of each month. Can you use it to see if you can improve your predictions without using time dummies?
# Read the monthly bond-yield file from the project's csv folder into a dataframe
bond_prices <- here::here("csv", "MonthBondYields.csv") %>%
  readr::read_csv()
## New names:
## * `` -> ...7
## * `` -> ...8
## * `` -> ...9
## * `` -> ...10
## Rows: 54 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Date, Change %
## dbl (4): Price, Open, High, Low
## lgl (4): ...7, ...8, ...9, ...10
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Make the date of the bond file comparable to the Lending Club dataset.
# Parsing the abbreviated month name ("%b") depends on the locale. If this does
# not work under your regional date/number settings, try running the following
# line of code in the Console:
#Sys.setlocale("LC_TIME","English")
bond_prices <- bond_prices %>%
  # Prefix each value of `Date` with "01-" and parse it as day-month-year.
  mutate(Date2 = as.Date(paste("01", Date, sep = "-"), "%d-%b-%y")) %>%
  # Drop the empty, unnamed columns of the csv. Depending on the readr version
  # they are auto-named either "X7", "X8", ... or "...7", "...8", ...
  select(-starts_with("X"), -starts_with("..."))
# Plot the bond yield over time. Lower bond yields mean the cost of
# borrowing has gone down.
ggplot(data = bond_prices, mapping = aes(x = Date2, y = Price)) +
  geom_point(size = 0.1, alpha = 0.5)
library(janitor)
# Convert the column names to snake_case (e.g. `Change %` -> change_percent,
# Date2 -> date2), then turn the percentage text into a numeric fraction:
# strip the "%" sign, convert to numeric and divide by 100.
# The column is referenced directly rather than as bond_prices$change_percent
# so the expression always uses the rows flowing through the pipeline.
bond_prices <- bond_prices %>%
  clean_names() %>%
  mutate(change = as.numeric(sub("%", "", change_percent)) / 100)
# Attach the bond data to each loan by issue month (left join), keep only
# the loans for which a bond price was found, and order them by issue date.
lc_with_bonds <- lc_clean %>%
  left_join(bond_prices, by = c("issue_d" = "date2")) %>%
  filter(!is.na(price)) %>%
  arrange(issue_d)
# Scatter plot of bond price against loan interest rate with a linear fit,
# to check graphically whether the two are related
ggplot(data = lc_with_bonds, mapping = aes(x = int_rate, y = price)) +
  geom_point(size = 0.1, alpha = 0.5) +
  geom_smooth(method = "lm") +
  theme_bw() +
  labs(
    title = "Correlation of Interest Rate and Bond Prices",
    x = "Interest Rate",
    y = "Bond Price"
  )
## `geom_smooth()` using formula 'y ~ x'
# Same scatter plot, coloured by grade, with one linear fit per grade
ggplot(
  data = lc_with_bonds,
  mapping = aes(x = int_rate, y = price, color = grade)
) +
  geom_point(size = 0.1, alpha = 0.5) +
  geom_smooth(method = "lm") +
  theme_bw() +
  labs(
    title = "Correlation of Interest Rate and Bond Prices",
    subtitle = "Grouped By Grade",
    x = "Interest Rate",
    y = "Bond Price"
  )
## `geom_smooth()` using formula 'y ~ x'
# Train a model that uses the bond information instead of time dummies:
# the monthly change in the bond series, interacted with grade.
plsFit <- train(
  int_rate ~ loan_amnt + term + dti + annual_inc + grade * loan_amnt +
    grade * change,
  data = lc_with_bonds,
  method = "lm",
  trControl = control
)
summary(plsFit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.117403 -0.006690 -0.000647 0.006705 0.039457
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.180e-02 2.332e-04 307.953 < 2e-16 ***
## loan_amnt 1.518e-07 2.007e-08 7.563 4.03e-14 ***
## term60 3.552e-03 1.412e-04 25.147 < 2e-16 ***
## dti 3.799e-05 8.167e-06 4.651 3.31e-06 ***
## annual_inc -9.974e-10 9.156e-10 -1.089 0.27603
## gradeB 3.592e-02 2.724e-04 131.892 < 2e-16 ***
## gradeC 6.175e-02 2.972e-04 207.788 < 2e-16 ***
## gradeD 8.019e-02 3.471e-04 231.021 < 2e-16 ***
## gradeE 9.589e-02 4.597e-04 208.616 < 2e-16 ***
## gradeF 1.137e-01 7.680e-04 148.084 < 2e-16 ***
## gradeG 1.317e-01 1.547e-03 85.125 < 2e-16 ***
## change 4.372e-03 1.069e-03 4.092 4.29e-05 ***
## `loan_amnt:gradeB` -6.377e-08 2.417e-08 -2.639 0.00833 **
## `loan_amnt:gradeC` -1.716e-07 2.596e-08 -6.609 3.93e-11 ***
## `loan_amnt:gradeD` 6.439e-08 2.770e-08 2.325 0.02010 *
## `loan_amnt:gradeE` 1.982e-07 2.991e-08 6.628 3.44e-11 ***
## `loan_amnt:gradeF` 2.559e-07 4.101e-08 6.240 4.43e-10 ***
## `loan_amnt:gradeG` 1.496e-07 7.196e-08 2.079 0.03764 *
## `gradeB:change` -1.417e-02 1.474e-03 -9.614 < 2e-16 ***
## `gradeC:change` -1.410e-02 1.621e-03 -8.700 < 2e-16 ***
## `gradeD:change` -2.933e-02 1.838e-03 -15.963 < 2e-16 ***
## `gradeE:change` -3.473e-02 2.395e-03 -14.503 < 2e-16 ***
## `gradeF:change` -4.051e-02 3.732e-03 -10.855 < 2e-16 ***
## `gradeG:change` -3.810e-02 6.262e-03 -6.085 1.18e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01041 on 37844 degrees of freedom
## Multiple R-squared: 0.922, Adjusted R-squared: 0.922
## F-statistic: 1.946e+04 on 23 and 37844 DF, p-value: < 2.2e-16
Do bond yields have any explanatory power?
The bond yields have explanatory power; however, they do not improve the prediction to the same level as the time dummies. The bond yields are correlated with the interest rate to a reasonable level, which allows the bond price to act within the model in a similar way to the time data, but less precisely. However, bond prices can be extrapolated to the future, so they are better data to use if the model needs to predict future interest rates.
Further investigating model options
# Resampling scheme passed to train(). The method "cv" stands for
# cross-validation; we are going to create 10 folds.
control <- trainControl(
  method = "cv",
  number = 10,
  # Setting this to TRUE makes the model report its progress after each
  # estimation.
  verboseIter = FALSE
)
# Train a linear model with the resampling settings held in `control`
# (k-fold cross-validation). The later impr_model* fits extend this formula.
impr_model1 <- train(
  int_rate ~ installment * poly(loan_amnt, 3) + annual_inc * term + dti +
    grade * price,
  data = lc_with_bonds,
  method = "lm",
  trControl = control
)
# Coefficient table, residual standard error and R-squared of the fitted model.
summary(impr_model1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.094964 -0.005019 0.000036 0.005670 0.039856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.924e-02 1.466e-03 -40.396 < 2e-16 ***
## installment 5.165e-04 5.585e-06 92.471 < 2e-16 ***
## `poly(loan_amnt, 3)1` -2.768e+00 6.644e-02 -41.667 < 2e-16 ***
## `poly(loan_amnt, 3)2` 3.435e+00 6.194e-02 55.450 < 2e-16 ***
## `poly(loan_amnt, 3)3` -4.324e+00 5.866e-02 -73.702 < 2e-16 ***
## annual_inc -3.715e-09 8.417e-10 -4.413 1.02e-05 ***
## term60 4.811e-02 5.397e-04 89.145 < 2e-16 ***
## dti 2.152e-05 6.813e-06 3.159 0.00158 **
## gradeB 4.227e-02 5.857e-04 72.173 < 2e-16 ***
## gradeC 6.608e-02 6.715e-04 98.415 < 2e-16 ***
## gradeD 9.378e-02 7.923e-04 118.370 < 2e-16 ***
## gradeE 1.137e-01 1.001e-03 113.637 < 2e-16 ***
## gradeF 1.291e-01 1.453e-03 88.871 < 2e-16 ***
## gradeG 1.423e-01 2.634e-03 54.027 < 2e-16 ***
## price 7.122e-04 1.455e-04 4.895 9.86e-07 ***
## `installment:poly(loan_amnt, 3)1` -4.157e-02 4.786e-04 -86.859 < 2e-16 ***
## `installment:poly(loan_amnt, 3)2` 1.435e-02 2.013e-04 71.275 < 2e-16 ***
## `installment:poly(loan_amnt, 3)3` 3.437e-04 3.614e-05 9.512 < 2e-16 ***
## `annual_inc:term60` 1.219e-08 1.925e-09 6.334 2.42e-10 ***
## `gradeB:price` -4.456e-03 1.966e-04 -22.663 < 2e-16 ***
## `gradeC:price` -5.890e-03 2.186e-04 -26.940 < 2e-16 ***
## `gradeD:price` -9.538e-03 2.538e-04 -37.582 < 2e-16 ***
## `gradeE:price` -1.180e-02 3.249e-04 -36.321 < 2e-16 ***
## `gradeF:price` -1.189e-02 4.794e-04 -24.814 < 2e-16 ***
## `gradeG:price` -1.216e-02 8.845e-04 -13.749 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008668 on 37843 degrees of freedom
## Multiple R-squared: 0.9459, Adjusted R-squared: 0.9459
## F-statistic: 2.758e+04 on 24 and 37843 DF, p-value: < 2.2e-16
# Same specification as impr_model1 plus the borrower's state (addr_state)
# as an additional categorical predictor.
impr_model2 <- train(
  int_rate ~ installment * poly(loan_amnt, 3) + annual_inc * term + dti +
    grade * price + addr_state,
  data = lc_with_bonds,
  method = "lm",
  trControl = control
)
# Coefficient table and fit statistics, including one dummy per state.
summary(impr_model2)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.095024 -0.005002 0.000035 0.005671 0.039932
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.979e-02 1.521e-03 -39.304 < 2e-16 ***
## installment 5.165e-04 5.585e-06 92.476 < 2e-16 ***
## `poly(loan_amnt, 3)1` -2.767e+00 6.644e-02 -41.642 < 2e-16 ***
## `poly(loan_amnt, 3)2` 3.432e+00 6.195e-02 55.399 < 2e-16 ***
## `poly(loan_amnt, 3)3` -4.327e+00 5.866e-02 -73.762 < 2e-16 ***
## annual_inc -3.731e-09 8.424e-10 -4.429 9.50e-06 ***
## term60 4.812e-02 5.397e-04 89.164 < 2e-16 ***
## dti 2.210e-05 6.838e-06 3.232 0.00123 **
## gradeB 4.224e-02 5.857e-04 72.118 < 2e-16 ***
## gradeC 6.604e-02 6.714e-04 98.352 < 2e-16 ***
## gradeD 9.372e-02 7.924e-04 118.280 < 2e-16 ***
## gradeE 1.137e-01 1.001e-03 113.557 < 2e-16 ***
## gradeF 1.290e-01 1.453e-03 88.750 < 2e-16 ***
## gradeG 1.421e-01 2.634e-03 53.954 < 2e-16 ***
## price 7.001e-04 1.455e-04 4.811 1.51e-06 ***
## addr_stateAZ 8.500e-04 5.014e-04 1.695 0.09006 .
## addr_stateCA 5.396e-04 4.206e-04 1.283 0.19959
## addr_stateCO 1.874e-04 5.109e-04 0.367 0.71374
## addr_stateCT 2.991e-04 5.157e-04 0.580 0.56199
## addr_stateFL 6.627e-04 4.386e-04 1.511 0.13083
## addr_stateGA 2.712e-04 4.689e-04 0.578 0.56310
## addr_stateIL 7.898e-04 4.643e-04 1.701 0.08892 .
## addr_stateKS -9.566e-05 6.651e-04 -0.144 0.88564
## addr_stateKY 4.171e-05 6.292e-04 0.066 0.94715
## addr_stateLA 9.121e-04 5.820e-04 1.567 0.11706
## addr_stateMA 9.482e-05 4.715e-04 0.201 0.84062
## addr_stateMD 7.076e-04 4.875e-04 1.451 0.14665
## addr_stateMI 5.066e-04 5.199e-04 0.975 0.32981
## addr_stateMN 2.356e-04 5.368e-04 0.439 0.66080
## addr_stateMO 1.738e-04 5.251e-04 0.331 0.74069
## addr_stateNC -3.163e-04 5.114e-04 -0.618 0.53626
## addr_stateNJ 7.993e-04 4.549e-04 1.757 0.07891 .
## addr_stateNV 1.194e-03 5.631e-04 2.121 0.03396 *
## addr_stateNY 7.510e-04 4.314e-04 1.741 0.08171 .
## addr_stateOH 1.082e-03 4.769e-04 2.268 0.02333 *
## addr_stateOK 1.731e-03 6.454e-04 2.682 0.00732 **
## addr_stateOR 9.245e-04 5.769e-04 1.603 0.10904
## addr_statePA 4.326e-04 4.644e-04 0.931 0.35161
## addr_stateSC 7.634e-04 5.704e-04 1.338 0.18076
## addr_stateTX 3.915e-04 4.402e-04 0.889 0.37378
## addr_stateUT 1.291e-04 6.754e-04 0.191 0.84841
## addr_stateVA 1.333e-03 4.687e-04 2.843 0.00447 **
## addr_stateWA 8.525e-04 5.055e-04 1.687 0.09171 .
## addr_stateWI 8.030e-04 5.741e-04 1.399 0.16190
## `installment:poly(loan_amnt, 3)1` -4.159e-02 4.787e-04 -86.883 < 2e-16 ***
## `installment:poly(loan_amnt, 3)2` 1.436e-02 2.013e-04 71.342 < 2e-16 ***
## `installment:poly(loan_amnt, 3)3` 3.429e-04 3.614e-05 9.488 < 2e-16 ***
## `annual_inc:term60` 1.208e-08 1.926e-09 6.273 3.58e-10 ***
## `gradeB:price` -4.448e-03 1.966e-04 -22.625 < 2e-16 ***
## `gradeC:price` -5.882e-03 2.186e-04 -26.904 < 2e-16 ***
## `gradeD:price` -9.521e-03 2.538e-04 -37.512 < 2e-16 ***
## `gradeE:price` -1.178e-02 3.249e-04 -36.249 < 2e-16 ***
## `gradeF:price` -1.185e-02 4.794e-04 -24.711 < 2e-16 ***
## `gradeG:price` -1.210e-02 8.844e-04 -13.678 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008665 on 37814 degrees of freedom
## Multiple R-squared: 0.946, Adjusted R-squared: 0.9459
## F-statistic: 1.25e+04 on 53 and 37814 DF, p-value: < 2.2e-16
# Same specification as impr_model1 plus home_ownership as an additional
# categorical predictor (addr_state is not included here).
impr_model3 <- train(
  int_rate ~ installment * poly(loan_amnt, 3) + annual_inc * term + dti +
    grade * price + home_ownership,
  data = lc_with_bonds,
  method = "lm",
  trControl = control
)
# Coefficient table and fit statistics, including the home_ownership dummies.
summary(impr_model3)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.095165 -0.005042 0.000012 0.005677 0.040363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.924e-02 1.465e-03 -40.426 < 2e-16 ***
## installment 5.150e-04 5.584e-06 92.220 < 2e-16 ***
## `poly(loan_amnt, 3)1` -2.750e+00 6.644e-02 -41.384 < 2e-16 ***
## `poly(loan_amnt, 3)2` 3.422e+00 6.191e-02 55.276 < 2e-16 ***
## `poly(loan_amnt, 3)3` -4.313e+00 5.863e-02 -73.556 < 2e-16 ***
## annual_inc -2.795e-09 8.501e-10 -3.288 0.00101 **
## term60 4.805e-02 5.393e-04 89.090 < 2e-16 ***
## dti 2.176e-05 6.809e-06 3.196 0.00139 **
## gradeB 4.221e-02 5.855e-04 72.095 < 2e-16 ***
## gradeC 6.601e-02 6.711e-04 98.368 < 2e-16 ***
## gradeD 9.366e-02 7.919e-04 118.270 < 2e-16 ***
## gradeE 1.136e-01 1.000e-03 113.595 < 2e-16 ***
## gradeF 1.290e-01 1.452e-03 88.845 < 2e-16 ***
## gradeG 1.420e-01 2.632e-03 53.954 < 2e-16 ***
## price 7.087e-04 1.454e-04 4.874 1.10e-06 ***
## home_ownershipNONE -7.576e-04 5.004e-03 -0.151 0.87967
## home_ownershipOTHER 2.528e-03 9.272e-04 2.727 0.00640 **
## home_ownershipOWN 6.910e-04 1.759e-04 3.928 8.57e-05 ***
## home_ownershipRENT 7.021e-04 9.693e-05 7.244 4.45e-13 ***
## `installment:poly(loan_amnt, 3)1` -4.146e-02 4.785e-04 -86.650 < 2e-16 ***
## `installment:poly(loan_amnt, 3)2` 1.432e-02 2.012e-04 71.144 < 2e-16 ***
## `installment:poly(loan_amnt, 3)3` 3.417e-04 3.611e-05 9.462 < 2e-16 ***
## `annual_inc:term60` 1.244e-08 1.924e-09 6.466 1.02e-10 ***
## `gradeB:price` -4.457e-03 1.965e-04 -22.684 < 2e-16 ***
## `gradeC:price` -5.893e-03 2.185e-04 -26.975 < 2e-16 ***
## `gradeD:price` -9.530e-03 2.536e-04 -37.577 < 2e-16 ***
## `gradeE:price` -1.180e-02 3.246e-04 -36.335 < 2e-16 ***
## `gradeF:price` -1.188e-02 4.790e-04 -24.810 < 2e-16 ***
## `gradeG:price` -1.210e-02 8.839e-04 -13.690 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008662 on 37839 degrees of freedom
## Multiple R-squared: 0.946, Adjusted R-squared: 0.946
## F-statistic: 2.368e+04 on 28 and 37839 DF, p-value: < 2.2e-16
# Let home_ownership interact with both annual_inc and term (full three-way
# interaction) instead of entering only as a main effect.
# NOTE(review): this fit is rank-deficient - caret warns during resampling and
# some interaction coefficients come back as NA; presumably sparse levels such
# as NONE do not occur with every term - confirm.
impr_model4 <- train(
  int_rate ~ installment * poly(loan_amnt, 3) +
    annual_inc * term * home_ownership + dti + grade * price,
  data = lc_with_bonds,
  method = "lm",
  trControl = control
)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Print the coefficient table and fit statistics of impr_model4.
print(summary(impr_model4))
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.095313 -0.005066 0.000024 0.005687 0.040543
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.016e-02 1.472e-03 -40.881 < 2e-16
## installment 5.170e-04 5.590e-06 92.484 < 2e-16
## `poly(loan_amnt, 3)1` -2.805e+00 6.696e-02 -41.888 < 2e-16
## `poly(loan_amnt, 3)2` 3.445e+00 6.201e-02 55.548 < 2e-16
## `poly(loan_amnt, 3)3` -4.309e+00 5.861e-02 -73.518 < 2e-16
## annual_inc -9.095e-10 9.934e-10 -0.916 0.359896
## term60 4.941e-02 5.799e-04 85.209 < 2e-16
## home_ownershipNONE 9.082e-03 1.079e-02 0.841 0.400137
## home_ownershipOTHER 3.042e-03 1.352e-03 2.250 0.024443
## home_ownershipOWN 8.296e-04 3.024e-04 2.744 0.006078
## home_ownershipRENT 1.426e-03 1.690e-04 8.438 < 2e-16
## dti 2.132e-05 6.812e-06 3.130 0.001748
## gradeB 4.211e-02 5.854e-04 71.930 < 2e-16
## gradeC 6.593e-02 6.709e-04 98.273 < 2e-16
## gradeD 9.360e-02 7.917e-04 118.234 < 2e-16
## gradeE 1.135e-01 1.000e-03 113.505 < 2e-16
## gradeF 1.288e-01 1.452e-03 88.730 < 2e-16
## gradeG 1.420e-01 2.631e-03 53.976 < 2e-16
## price 6.998e-04 1.453e-04 4.815 1.48e-06
## `installment:poly(loan_amnt, 3)1` -4.152e-02 4.784e-04 -86.779 < 2e-16
## `installment:poly(loan_amnt, 3)2` 1.430e-02 2.012e-04 71.106 < 2e-16
## `installment:poly(loan_amnt, 3)3` 3.406e-04 3.610e-05 9.435 < 2e-16
## `annual_inc:term60` 5.137e-09 2.404e-09 2.137 0.032592
## `annual_inc:home_ownershipNONE` -1.195e-07 1.185e-07 -1.008 0.313387
## `annual_inc:home_ownershipOTHER` -5.310e-09 1.353e-08 -0.392 0.694754
## `annual_inc:home_ownershipOWN` 2.133e-09 3.697e-09 0.577 0.563928
## `annual_inc:home_ownershipRENT` -6.983e-09 1.967e-09 -3.550 0.000385
## `term60:home_ownershipNONE` NA NA NA NA
## `term60:home_ownershipOTHER` 2.423e-03 8.727e-03 0.278 0.781248
## `term60:home_ownershipOWN` -1.862e-03 6.777e-04 -2.747 0.006016
## `term60:home_ownershipRENT` -2.216e-03 3.559e-04 -6.228 4.79e-10
## `gradeB:price` -4.447e-03 1.964e-04 -22.640 < 2e-16
## `gradeC:price` -5.893e-03 2.184e-04 -26.983 < 2e-16
## `gradeD:price` -9.543e-03 2.535e-04 -37.639 < 2e-16
## `gradeE:price` -1.179e-02 3.246e-04 -36.331 < 2e-16
## `gradeF:price` -1.187e-02 4.789e-04 -24.777 < 2e-16
## `gradeG:price` -1.214e-02 8.835e-04 -13.739 < 2e-16
## `annual_inc:term60:home_ownershipNONE` NA NA NA NA
## `annual_inc:term60:home_ownershipOTHER` NA NA NA NA
## `annual_inc:term60:home_ownershipOWN` 1.533e-08 8.568e-09 1.789 0.073567
## `annual_inc:term60:home_ownershipRENT` 1.704e-08 4.117e-09 4.140 3.49e-05
##
## (Intercept) ***
## installment ***
## `poly(loan_amnt, 3)1` ***
## `poly(loan_amnt, 3)2` ***
## `poly(loan_amnt, 3)3` ***
## annual_inc
## term60 ***
## home_ownershipNONE
## home_ownershipOTHER *
## home_ownershipOWN **
## home_ownershipRENT ***
## dti **
## gradeB ***
## gradeC ***
## gradeD ***
## gradeE ***
## gradeF ***
## gradeG ***
## price ***
## `installment:poly(loan_amnt, 3)1` ***
## `installment:poly(loan_amnt, 3)2` ***
## `installment:poly(loan_amnt, 3)3` ***
## `annual_inc:term60` *
## `annual_inc:home_ownershipNONE`
## `annual_inc:home_ownershipOTHER`
## `annual_inc:home_ownershipOWN`
## `annual_inc:home_ownershipRENT` ***
## `term60:home_ownershipNONE`
## `term60:home_ownershipOTHER`
## `term60:home_ownershipOWN` **
## `term60:home_ownershipRENT` ***
## `gradeB:price` ***
## `gradeC:price` ***
## `gradeD:price` ***
## `gradeE:price` ***
## `gradeF:price` ***
## `gradeG:price` ***
## `annual_inc:term60:home_ownershipNONE`
## `annual_inc:term60:home_ownershipOTHER`
## `annual_inc:term60:home_ownershipOWN` .
## `annual_inc:term60:home_ownershipRENT` ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008657 on 37830 degrees of freedom
## Multiple R-squared: 0.9461, Adjusted R-squared: 0.946
## F-statistic: 1.794e+04 on 37 and 37830 DF, p-value: < 2.2e-16
# Extend the impr_model1 specification with home_ownership, addr_state and
# their interaction (one term per ownership-by-state combination).
# The resulting fit is rank-deficient: the summary reports 32 coefficients as
# not defined because of singularities.
impr_model5 <- train(
  int_rate ~ installment * poly(loan_amnt, 3) + annual_inc * term + dti +
    grade * price + home_ownership * addr_state,
  data = lc_with_bonds,
  method = "lm",
  trControl = control
)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Coefficient table and fit statistics of impr_model5.
summary(impr_model5)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.095254 -0.005035 0.000043 0.005654 0.040146
##
## Coefficients: (32 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.976e-02 1.552e-03 -38.499 < 2e-16 ***
## installment 5.148e-04 5.588e-06 92.122 < 2e-16 ***
## `poly(loan_amnt, 3)1` -2.750e+00 6.648e-02 -41.369 < 2e-16 ***
## `poly(loan_amnt, 3)2` 3.419e+00 6.196e-02 55.183 < 2e-16 ***
## `poly(loan_amnt, 3)3` -4.312e+00 5.866e-02 -73.512 < 2e-16 ***
## annual_inc -2.818e-09 8.527e-10 -3.305 0.000952 ***
## term60 4.803e-02 5.399e-04 88.973 < 2e-16 ***
## dti 2.165e-05 6.845e-06 3.162 0.001566 **
## gradeB 4.221e-02 5.860e-04 72.027 < 2e-16 ***
## gradeC 6.596e-02 6.716e-04 98.214 < 2e-16 ***
## gradeD 9.363e-02 7.926e-04 118.126 < 2e-16 ***
## gradeE 1.136e-01 1.001e-03 113.515 < 2e-16 ***
## gradeF 1.290e-01 1.453e-03 88.777 < 2e-16 ***
## gradeG 1.419e-01 2.632e-03 53.906 < 2e-16 ***
## price 6.977e-04 1.456e-04 4.792 1.66e-06 ***
## home_ownershipNONE 4.630e-03 8.662e-03 0.535 0.592969
## home_ownershipOTHER 5.274e-03 8.675e-03 0.608 0.543204
## home_ownershipOWN 6.153e-04 1.195e-03 0.515 0.606637
## home_ownershipRENT 1.528e-03 1.019e-03 1.500 0.133681
## addr_stateAZ 1.050e-03 6.397e-04 1.642 0.100573
## addr_stateCA 7.402e-04 5.389e-04 1.374 0.169566
## addr_stateCO 4.233e-04 6.670e-04 0.635 0.525699
## addr_stateCT -2.467e-04 6.897e-04 -0.358 0.720592
## addr_stateFL 7.101e-04 5.585e-04 1.272 0.203522
## addr_stateGA 5.718e-04 5.874e-04 0.973 0.330362
## addr_stateIL 8.291e-04 5.985e-04 1.385 0.165965
## addr_stateKS 2.582e-05 8.603e-04 0.030 0.976059
## addr_stateKY 7.001e-04 7.855e-04 0.891 0.372758
## addr_stateLA 1.422e-03 7.625e-04 1.864 0.062277 .
## addr_stateMA 5.990e-04 6.371e-04 0.940 0.347108
## addr_stateMD 7.235e-04 6.374e-04 1.135 0.256357
## addr_stateMI 7.966e-04 6.499e-04 1.226 0.220365
## addr_stateMN -2.436e-04 6.927e-04 -0.352 0.725090
## addr_stateMO -3.770e-04 6.584e-04 -0.573 0.566910
## addr_stateNC 2.570e-04 6.489e-04 0.396 0.692037
## addr_stateNJ 5.860e-04 6.114e-04 0.959 0.337799
## addr_stateNV 1.260e-03 7.360e-04 1.712 0.086873 .
## addr_stateNY -2.379e-05 5.852e-04 -0.041 0.967574
## addr_stateOH 1.112e-03 6.053e-04 1.838 0.066141 .
## addr_stateOK 1.725e-03 8.018e-04 2.152 0.031405 *
## addr_stateOR 4.922e-04 8.150e-04 0.604 0.545910
## addr_statePA 1.447e-04 6.019e-04 0.240 0.810038
## addr_stateSC 4.930e-04 7.203e-04 0.684 0.493728
## addr_stateTX 6.093e-04 5.547e-04 1.099 0.271991
## addr_stateUT -2.646e-04 8.567e-04 -0.309 0.757451
## addr_stateVA 1.461e-03 6.070e-04 2.406 0.016122 *
## addr_stateWA 4.611e-04 6.782e-04 0.680 0.496601
## addr_stateWI 1.244e-03 7.536e-04 1.651 0.098673 .
## `installment:poly(loan_amnt, 3)1` -4.144e-02 4.788e-04 -86.566 < 2e-16 ***
## `installment:poly(loan_amnt, 3)2` 1.431e-02 2.013e-04 71.101 < 2e-16 ***
## `installment:poly(loan_amnt, 3)3` 3.419e-04 3.614e-05 9.461 < 2e-16 ***
## `annual_inc:term60` 1.237e-08 1.926e-09 6.422 1.36e-10 ***
## `gradeB:price` -4.461e-03 1.967e-04 -22.677 < 2e-16 ***
## `gradeC:price` -5.881e-03 2.187e-04 -26.893 < 2e-16 ***
## `gradeD:price` -9.522e-03 2.539e-04 -37.508 < 2e-16 ***
## `gradeE:price` -1.178e-02 3.248e-04 -36.275 < 2e-16 ***
## `gradeF:price` -1.189e-02 4.793e-04 -24.800 < 2e-16 ***
## `gradeG:price` -1.206e-02 8.838e-04 -13.640 < 2e-16 ***
## `home_ownershipNONE:addr_stateAZ` NA NA NA NA
## `home_ownershipOTHER:addr_stateAZ` -4.123e-03 9.508e-03 -0.434 0.664570
## `home_ownershipOWN:addr_stateAZ` -2.364e-03 1.772e-03 -1.334 0.182307
## `home_ownershipRENT:addr_stateAZ` -7.446e-04 1.188e-03 -0.627 0.530933
## `home_ownershipNONE:addr_stateCA` -4.401e-03 1.225e-02 -0.359 0.719381
## `home_ownershipOTHER:addr_stateCA` -4.577e-03 8.881e-03 -0.515 0.606261
## `home_ownershipOWN:addr_stateCA` 5.813e-04 1.283e-03 0.453 0.650466
## `home_ownershipRENT:addr_stateCA` -1.304e-03 1.043e-03 -1.251 0.211113
## `home_ownershipNONE:addr_stateCO` NA NA NA NA
## `home_ownershipOTHER:addr_stateCO` -2.144e-03 9.510e-03 -0.225 0.821603
## `home_ownershipOWN:addr_stateCO` 1.626e-04 1.923e-03 0.085 0.932623
## `home_ownershipRENT:addr_stateCO` -1.269e-03 1.199e-03 -1.058 0.289920
## `home_ownershipNONE:addr_stateCT` NA NA NA NA
## `home_ownershipOTHER:addr_stateCT` 1.730e-02 1.226e-02 1.411 0.158249
## `home_ownershipOWN:addr_stateCT` 2.697e-03 1.684e-03 1.601 0.109294
## `home_ownershipRENT:addr_stateCT` -1.467e-04 1.213e-03 -0.121 0.903772
## `home_ownershipNONE:addr_stateFL` NA NA NA NA
## `home_ownershipOTHER:addr_stateFL` 5.742e-03 9.698e-03 0.592 0.553797
## `home_ownershipOWN:addr_stateFL` 1.650e-04 1.331e-03 0.124 0.901363
## `home_ownershipRENT:addr_stateFL` -8.773e-04 1.073e-03 -0.817 0.413746
## `home_ownershipNONE:addr_stateGA` NA NA NA NA
## `home_ownershipOTHER:addr_stateGA` 3.587e-04 9.373e-03 0.038 0.969472
## `home_ownershipOWN:addr_stateGA` -1.148e-03 1.508e-03 -0.762 0.446224
## `home_ownershipRENT:addr_stateGA` -1.153e-03 1.137e-03 -1.014 0.310706
## `home_ownershipNONE:addr_stateIL` NA NA NA NA
## `home_ownershipOTHER:addr_stateIL` 8.813e-04 1.002e-02 0.088 0.929902
## `home_ownershipOWN:addr_stateIL` 1.082e-04 1.505e-03 0.072 0.942698
## `home_ownershipRENT:addr_stateIL` -8.345e-04 1.118e-03 -0.747 0.455229
## `home_ownershipNONE:addr_stateKS` NA NA NA NA
## `home_ownershipOTHER:addr_stateKS` -9.701e-03 1.064e-02 -0.912 0.361858
## `home_ownershipOWN:addr_stateKS` -3.243e-05 2.100e-03 -0.015 0.987676
## `home_ownershipRENT:addr_stateKS` -6.979e-04 1.550e-03 -0.450 0.652591
## `home_ownershipNONE:addr_stateKY` NA NA NA NA
## `home_ownershipOTHER:addr_stateKY` NA NA NA NA
## `home_ownershipOWN:addr_stateKY` -3.185e-03 2.032e-03 -1.567 0.117115
## `home_ownershipRENT:addr_stateKY` -1.505e-03 1.503e-03 -1.001 0.316700
## `home_ownershipNONE:addr_stateLA` NA NA NA NA
## `home_ownershipOTHER:addr_stateLA` 3.613e-03 1.227e-02 0.295 0.768366
## `home_ownershipOWN:addr_stateLA` -1.655e-03 1.779e-03 -0.930 0.352309
## `home_ownershipRENT:addr_stateLA` -1.536e-03 1.363e-03 -1.127 0.259712
## `home_ownershipNONE:addr_stateMA` -1.269e-02 1.225e-02 -1.036 0.300290
## `home_ownershipOTHER:addr_stateMA` -1.423e-02 9.703e-03 -1.466 0.142578
## `home_ownershipOWN:addr_stateMA` 5.636e-04 1.523e-03 0.370 0.711281
## `home_ownershipRENT:addr_stateMA` -1.813e-03 1.135e-03 -1.598 0.110125
## `home_ownershipNONE:addr_stateMD` NA NA NA NA
## `home_ownershipOTHER:addr_stateMD` -5.516e-03 1.062e-02 -0.519 0.603659
## `home_ownershipOWN:addr_stateMD` -6.560e-04 1.605e-03 -0.409 0.682763
## `home_ownershipRENT:addr_stateMD` -6.631e-04 1.160e-03 -0.572 0.567412
## `home_ownershipNONE:addr_stateMI` NA NA NA NA
## `home_ownershipOTHER:addr_stateMI` -6.536e-04 1.062e-02 -0.062 0.950943
## `home_ownershipOWN:addr_stateMI` -8.151e-04 1.623e-03 -0.502 0.615536
## `home_ownershipRENT:addr_stateMI` -1.069e-03 1.259e-03 -0.849 0.395654
## `home_ownershipNONE:addr_stateMN` NA NA NA NA
## `home_ownershipOTHER:addr_stateMN` 8.259e-03 1.226e-02 0.673 0.500661
## `home_ownershipOWN:addr_stateMN` -1.986e-03 1.842e-03 -1.078 0.280901
## `home_ownershipRENT:addr_stateMN` 1.009e-03 1.256e-03 0.803 0.421916
## `home_ownershipNONE:addr_stateMO` NA NA NA NA
## `home_ownershipOTHER:addr_stateMO` 7.066e-04 1.226e-02 0.058 0.954055
## `home_ownershipOWN:addr_stateMO` 2.031e-03 1.722e-03 1.179 0.238441
## `home_ownershipRENT:addr_stateMO` 9.549e-04 1.256e-03 0.760 0.447169
## `home_ownershipNONE:addr_stateNC` NA NA NA NA
## `home_ownershipOTHER:addr_stateNC` -2.134e-02 1.226e-02 -1.740 0.081828 .
## `home_ownershipOWN:addr_stateNC` -2.403e-03 1.698e-03 -1.415 0.157167
## `home_ownershipRENT:addr_stateNC` -1.574e-03 1.215e-03 -1.296 0.194976
## `home_ownershipNONE:addr_stateNJ` NA NA NA NA
## `home_ownershipOTHER:addr_stateNJ` -6.708e-03 1.002e-02 -0.670 0.503100
## `home_ownershipOWN:addr_stateNJ` -2.890e-04 1.407e-03 -0.205 0.837256
## `home_ownershipRENT:addr_stateNJ` -4.774e-04 1.107e-03 -0.431 0.666367
## `home_ownershipNONE:addr_stateNV` NA NA NA NA
## `home_ownershipOTHER:addr_stateNV` 7.785e-03 1.227e-02 0.635 0.525639
## `home_ownershipOWN:addr_stateNV` 9.118e-04 2.424e-03 0.376 0.706803
## `home_ownershipRENT:addr_stateNV` -9.080e-04 1.292e-03 -0.703 0.482201
## `home_ownershipNONE:addr_stateNY` NA NA NA NA
## `home_ownershipOTHER:addr_stateNY` -4.513e-03 9.276e-03 -0.486 0.626621
## `home_ownershipOWN:addr_stateNY` 1.500e-03 1.315e-03 1.140 0.254111
## `home_ownershipRENT:addr_stateNY` -6.618e-05 1.073e-03 -0.062 0.950829
## `home_ownershipNONE:addr_stateOH` NA NA NA NA
## `home_ownershipOTHER:addr_stateOH` 1.111e-02 1.062e-02 1.046 0.295635
## `home_ownershipOWN:addr_stateOH` -3.448e-04 1.520e-03 -0.227 0.820503
## `home_ownershipRENT:addr_stateOH` -5.917e-04 1.147e-03 -0.516 0.605953
## `home_ownershipNONE:addr_stateOK` NA NA NA NA
## `home_ownershipOTHER:addr_stateOK` NA NA NA NA
## `home_ownershipOWN:addr_stateOK` -1.630e-03 2.119e-03 -0.769 0.441697
## `home_ownershipRENT:addr_stateOK` 4.319e-04 1.544e-03 0.280 0.779678
## `home_ownershipNONE:addr_stateOR` NA NA NA NA
## `home_ownershipOTHER:addr_stateOR` -1.120e-02 1.227e-02 -0.912 0.361564
## `home_ownershipOWN:addr_stateOR` 2.571e-03 2.553e-03 1.007 0.313964
## `home_ownershipRENT:addr_stateOR` -1.657e-04 1.321e-03 -0.125 0.900151
## `home_ownershipNONE:addr_statePA` NA NA NA NA
## `home_ownershipOTHER:addr_statePA` -1.774e-03 1.226e-02 -0.145 0.884967
## `home_ownershipOWN:addr_statePA` 1.829e-04 1.423e-03 0.129 0.897737
## `home_ownershipRENT:addr_statePA` -9.225e-05 1.121e-03 -0.082 0.934420
## `home_ownershipNONE:addr_stateSC` NA NA NA NA
## `home_ownershipOTHER:addr_stateSC` -2.099e-03 1.227e-02 -0.171 0.864121
## `home_ownershipOWN:addr_stateSC` 8.477e-04 1.803e-03 0.470 0.638206
## `home_ownershipRENT:addr_stateSC` 2.882e-04 1.359e-03 0.212 0.832029
## `home_ownershipNONE:addr_stateTX` NA NA NA NA
## `home_ownershipOTHER:addr_stateTX` -4.885e-03 9.370e-03 -0.521 0.602162
## `home_ownershipOWN:addr_stateTX` 1.495e-04 1.328e-03 0.113 0.910338
## `home_ownershipRENT:addr_stateTX` -1.217e-03 1.079e-03 -1.128 0.259301
## `home_ownershipNONE:addr_stateUT` NA NA NA NA
## `home_ownershipOTHER:addr_stateUT` NA NA NA NA
## `home_ownershipOWN:addr_stateUT` 3.402e-04 3.552e-03 0.096 0.923685
## `home_ownershipRENT:addr_stateUT` 6.040e-04 1.518e-03 0.398 0.690650
## `home_ownershipNONE:addr_stateVA` NA NA NA NA
## `home_ownershipOTHER:addr_stateVA` -1.054e-05 9.373e-03 -0.001 0.999102
## `home_ownershipOWN:addr_stateVA` -5.277e-06 1.531e-03 -0.003 0.997251
## `home_ownershipRENT:addr_stateVA` -1.066e-03 1.125e-03 -0.947 0.343615
## `home_ownershipNONE:addr_stateWA` NA NA NA NA
## `home_ownershipOTHER:addr_stateWA` NA NA NA NA
## `home_ownershipOWN:addr_stateWA` 8.202e-04 1.958e-03 0.419 0.675353
## `home_ownershipRENT:addr_stateWA` -8.587e-05 1.188e-03 -0.072 0.942368
## `home_ownershipNONE:addr_stateWI` NA NA NA NA
## `home_ownershipOTHER:addr_stateWI` NA NA NA NA
## `home_ownershipOWN:addr_stateWI` -4.162e-03 2.268e-03 -1.835 0.066539 .
## `home_ownershipRENT:addr_stateWI` -1.234e-03 1.316e-03 -0.938 0.348307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008654 on 37726 degrees of freedom
## Multiple R-squared: 0.9463, Adjusted R-squared: 0.9461
## F-statistic: 4712 on 141 and 37726 DF, p-value: < 2.2e-16
# F-test of impr_model2 against impr_model1: does adding addr_state reduce the
# residual sum of squares by more than chance?
anova(impr_model1$finalModel, impr_model2$finalModel)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.78e+04 | 2.84 | ||||
3.78e+04 | 2.84 | 29 | 0.00427 | 1.96 | 0.00148 |
# NOTE(review): impr_model2 adds addr_state and impr_model3 adds home_ownership
# to the same base formula, so neither model is nested in the other. The
# F-test in anova() assumes nested models (the result shows a negative Df and
# no F statistic) - confirm whether this comparison is intended.
anova(impr_model2$finalModel, impr_model3$finalModel)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.78e+04 | 2.84 | ||||
3.78e+04 | 2.84 | -25 | 0.000318 |
# F-test of impr_model4 against impr_model3: do the interactions of
# home_ownership with annual_inc and term improve the fit?
anova(impr_model3$finalModel, impr_model4$finalModel)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.78e+04 | 2.84 | ||||
3.78e+04 | 2.84 | 9 | 0.00376 | 5.57 | 1.01e-07 |
# NOTE(review): impr_model5 adds home_ownership * addr_state but does not
# contain the annual_inc/term interactions with home_ownership that
# impr_model4 has, so the two models are not nested and this F-test should be
# read with caution - confirm.
anova(impr_model4$finalModel, impr_model5$finalModel)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.78e+04 | 2.84 | ||||
3.77e+04 | 2.83 | 104 | 0.00941 | 1.21 | 0.0731 |
# Sequential comparison of all five fits; each row is tested against the
# previous one.
# NOTE(review): the steps model 2 -> 3 and model 4 -> 5 compare non-nested
# models (see their formulas), so only the 1 -> 2 and 3 -> 4 rows are
# nested-model F-tests - confirm.
anova(impr_model1$finalModel,impr_model2$finalModel,impr_model3$finalModel,impr_model4$finalModel,impr_model5$finalModel)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.78e+04 | 2.84 | ||||
3.78e+04 | 2.84 | 29 | 0.00427 | 1.97 | 0.00143 |
3.78e+04 | 2.84 | -25 | 0.000318 | ||
3.78e+04 | 2.84 | 9 | 0.00376 | 5.58 | 1e-07 |
3.77e+04 | 2.83 | 104 | 0.00941 | 1.21 | 0.0731 |
# List the distinct home_ownership categories present in the data.
unique(lc_with_bonds$home_ownership)
## [1] "MORTGAGE" "RENT" "OWN" "NONE" "OTHER"
# Fix the random seed so the random split below is reproducible.
set.seed(1234)
# Split the rows 50/50: one half to fit the model, the other half to test it.
train_test_split <- lc_with_bonds %>% initial_split(prop = 0.5)
testing <- testing(train_test_split)
training <- training(train_test_split)
# Resampling settings for tuning the LASSO: 10-fold cross-validation.
control <- trainControl(
  method = "cv",
  number = 10,
  # Set verboseIter to TRUE to have the model report its progress after each
  # estimation. FALSE is spelled out because `F` is an ordinary variable that
  # can be reassigned, whereas FALSE is a reserved word.
  verboseIter = FALSE
)
# Candidate penalty strengths: 1000 evenly spaced values between 0 and 0.01.
# `length.out` is written in full rather than relying on partial matching of
# `length`.
lambda_seq <- seq(0, 0.01, length.out = 1000)
# LASSO regression; k-fold cross-validation selects the best lambda from
# lambda_seq.
lasso <- train(
  int_rate ~ installment * poly(loan_amnt, 3) + term + dti + grade * price +
    grade:poly(loan_amnt, 3):term + poly(loan_amnt, 3):term + grade:term +
    addr_state * annual_inc,
  data = training,
  method = "glmnet",
  # Standardize (center and scale) the predictors before running the LASSO;
  # argument name spelled out in full instead of the abbreviated `preProc`.
  preProcess = c("center", "scale"),
  trControl = control,
  # alpha = 1 requests a LASSO regression; alpha = 0 would run ridge regression.
  tuneGrid = expand.grid(alpha = 1, lambda = lambda_seq)
)
# Model coefficients at the lambda chosen by cross-validation. The predictors
# were centered and scaled in the train() call, so the values are on the
# standardized scale.
coef(lasso$finalModel, lasso$bestTune$lambda)
## 127 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.203647e-01
## installment 4.111018e-01
## poly(loan_amnt, 3)1 -1.171054e-01
## poly(loan_amnt, 3)2 8.483312e-02
## poly(loan_amnt, 3)3 -4.871093e-02
## term60 8.243660e-02
## dti 3.889770e-05
## gradeB 5.557754e-03
## gradeC 5.825878e-03
## gradeD 6.397624e-03
## gradeE 4.643242e-03
## gradeF 2.752099e-03
## gradeG 1.900519e-03
## price 7.329619e-05
## addr_stateAZ -2.194825e-06
## addr_stateCA -1.061541e-04
## addr_stateCO -9.051871e-05
## addr_stateCT -3.691006e-05
## addr_stateFL 3.579370e-05
## addr_stateGA -5.010631e-05
## addr_stateIL 2.738985e-05
## addr_stateKS 6.940052e-05
## addr_stateKY -1.663090e-05
## addr_stateLA 3.945590e-05
## addr_stateMA -8.512827e-05
## addr_stateMD -1.871328e-05
## addr_stateMI -1.416929e-04
## addr_stateMN 1.275859e-04
## addr_stateMO 2.630119e-05
## addr_stateNC -6.236780e-05
## addr_stateNJ 5.181981e-05
## addr_stateNV 3.418857e-05
## addr_stateNY -1.929327e-05
## addr_stateOH .
## addr_stateOK 4.650421e-05
## addr_stateOR 6.212557e-05
## addr_statePA -3.659277e-05
## addr_stateSC .
## addr_stateTX -8.230856e-06
## addr_stateUT 2.841013e-05
## addr_stateVA .
## addr_stateWA 6.170835e-05
## addr_stateWI 1.561012e-05
## annual_inc -7.053377e-05
## installment:poly(loan_amnt, 3)1 -3.806753e-01
## installment:poly(loan_amnt, 3)2 1.103283e-01
## installment:poly(loan_amnt, 3)3 -1.866514e-04
## gradeB:price -2.036577e-03
## gradeC:price -8.551595e-04
## gradeD:price -8.501701e-04
## gradeE:price 2.524762e-04
## gradeF:price 3.983051e-04
## gradeG:price 2.132233e-04
## poly(loan_amnt, 3)1:term60 2.269658e-02
## poly(loan_amnt, 3)2:term60 -2.482986e-02
## poly(loan_amnt, 3)3:term60 7.980065e-03
## term60:gradeB -3.120612e-04
## term60:gradeC -4.344084e-04
## term60:gradeD -7.147718e-04
## term60:gradeE -8.231259e-04
## term60:gradeF -5.234864e-04
## term60:gradeG -4.474709e-04
## addr_stateAZ:annual_inc -3.413409e-06
## addr_stateCA:annual_inc 6.762994e-05
## addr_stateCO:annual_inc 8.266607e-05
## addr_stateCT:annual_inc 1.835827e-05
## addr_stateFL:annual_inc -2.199002e-05
## addr_stateGA:annual_inc -2.214634e-05
## addr_stateIL:annual_inc 4.284712e-05
## addr_stateKS:annual_inc -9.398596e-05
## addr_stateKY:annual_inc 1.829080e-05
## addr_stateLA:annual_inc -2.220243e-05
## addr_stateMA:annual_inc -2.487178e-07
## addr_stateMD:annual_inc 4.627167e-05
## addr_stateMI:annual_inc 1.581939e-04
## addr_stateMN:annual_inc -1.946698e-04
## addr_stateMO:annual_inc .
## addr_stateNC:annual_inc 2.821883e-05
## addr_stateNJ:annual_inc -1.052273e-05
## addr_stateNV:annual_inc -2.029405e-05
## addr_stateNY:annual_inc 4.314602e-05
## addr_stateOH:annual_inc .
## addr_stateOK:annual_inc -3.311027e-05
## addr_stateOR:annual_inc -7.388197e-05
## addr_statePA:annual_inc 7.091587e-05
## addr_stateSC:annual_inc -6.402034e-05
## addr_stateTX:annual_inc 2.171293e-05
## addr_stateUT:annual_inc -2.186497e-05
## addr_stateVA:annual_inc 3.498456e-06
## addr_stateWA:annual_inc -4.060757e-05
## addr_stateWI:annual_inc -4.512479e-05
## poly(loan_amnt, 3)1:term36:gradeB -2.734440e-03
## poly(loan_amnt, 3)2:term36:gradeB 1.987322e-03
## poly(loan_amnt, 3)3:term36:gradeB -1.139147e-03
## poly(loan_amnt, 3)1:term60:gradeB 4.988954e-07
## poly(loan_amnt, 3)2:term60:gradeB 3.355454e-03
## poly(loan_amnt, 3)3:term60:gradeB 6.513913e-04
## poly(loan_amnt, 3)1:term36:gradeC -3.616482e-03
## poly(loan_amnt, 3)2:term36:gradeC 2.749295e-03
## poly(loan_amnt, 3)3:term36:gradeC -1.632666e-03
## poly(loan_amnt, 3)1:term60:gradeC -1.677279e-03
## poly(loan_amnt, 3)2:term60:gradeC 4.288533e-03
## poly(loan_amnt, 3)3:term60:gradeC -2.093729e-05
## poly(loan_amnt, 3)1:term36:gradeD -4.021553e-03
## poly(loan_amnt, 3)2:term36:gradeD 2.796767e-03
## poly(loan_amnt, 3)3:term36:gradeD -1.789983e-03
## poly(loan_amnt, 3)1:term60:gradeD -2.543340e-03
## poly(loan_amnt, 3)2:term60:gradeD 4.598224e-03
## poly(loan_amnt, 3)3:term60:gradeD -4.654168e-04
## poly(loan_amnt, 3)1:term36:gradeE -2.741510e-03
## poly(loan_amnt, 3)2:term36:gradeE 1.742129e-03
## poly(loan_amnt, 3)3:term36:gradeE -1.161179e-03
## poly(loan_amnt, 3)1:term60:gradeE -4.209877e-03
## poly(loan_amnt, 3)2:term60:gradeE 6.825045e-03
## poly(loan_amnt, 3)3:term60:gradeE -9.890742e-04
## poly(loan_amnt, 3)1:term36:gradeF -1.612771e-03
## poly(loan_amnt, 3)2:term36:gradeF 6.918419e-04
## poly(loan_amnt, 3)3:term36:gradeF -6.966659e-04
## poly(loan_amnt, 3)1:term60:gradeF -4.133441e-03
## poly(loan_amnt, 3)2:term60:gradeF 6.051258e-03
## poly(loan_amnt, 3)3:term60:gradeF -1.127521e-03
## poly(loan_amnt, 3)1:term36:gradeG -1.188380e-03
## poly(loan_amnt, 3)2:term36:gradeG 6.851644e-04
## poly(loan_amnt, 3)3:term36:gradeG -4.743260e-04
## poly(loan_amnt, 3)1:term60:gradeG -3.074357e-03
## poly(loan_amnt, 3)2:term60:gradeG 3.814081e-03
## poly(loan_amnt, 3)3:term60:gradeG -1.004517e-03
# Penalty strength (lambda) picked by cross-validation
lasso$bestTune$lambda
## [1] 0
# Number of coefficients that are non-zero vs. exactly zero at the selected lambda
sum(coef(lasso$finalModel, s = lasso$bestTune$lambda) != 0)
## [1] 122
sum(coef(lasso$finalModel, s = lasso$bestTune$lambda) == 0)
## [1] 5
# Score the hold-out set with the tuned model
predictions <- predict(lasso, newdata = testing)
# Hold-out performance: root mean squared error and R-squared (caret helpers)
data.frame(
  RMSE = RMSE(pred = predictions, obs = testing$int_rate),
  Rsquare = R2(pred = predictions, obs = testing$int_rate)
)
RMSE | Rsquare |
---|---|
0.00414 | 0.988 |
After trying out multiple models we conclude that model 4 has the lowest prediction error and the highest R-squared, but the improvements in predictive power are small at this point. Model 5 showed a significant improvement over model 4 at the 90% confidence level, but it also introduced a lot of non-significant explanatory variables from the home-ownership and state interaction. Among the linear models: the adjusted R-squared of model 4 is 0.946. Its residual standard error (RSE) is 0.0086657, which means that the 95% confidence interval around a prediction is approximately \(CI = \left[-1.96 \cdot RSE;\ +1.96 \cdot RSE \right] = \left[-0.01698477;\ 0.01698477 \right] \approx \left[-1.70\%;\ 1.70\% \right]\). The features of model 4 can be seen within the model summary. Using a LASSO regression results in much better predictive power, with an R-squared of 98.75% and a confidence interval of \(CI = \left[-0.81504\%;\ 0.81504\% \right]\). For this model only 50% of the data was used for training, as the earlier training curves showed that this is enough. The reason for the better performance of this model is likely to be attributed to the fact that LASSO drives the non-significant variables to zero, which the other models can’t, thus retaining only the significant predictors from variable interactions. Note, however, that cross-validation selected a lambda of 0 (essentially no shrinkage), so part of the gain may instead come from the richer set of interaction terms in this model's formula.
Using other publicly available datasets to further improve performance (e.g., quarterly data on US inflation or CPI).
# Load the quarterly CPI series. Unlike the LendingClub file, no rows are skipped here:
# the first row of this CSV is already the header (DATE, CPALTT01USQ657N).
fed_raw <- here::here("csv", "CPALTT01USQ657N.csv") %>%
  read_csv() %>%
  clean_names() # janitor::clean_names() converts the headers to snake_case
## Rows: 246 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): CPALTT01USQ657N
## date (1): DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Profile the CPI table: column types, missing values and the range of each column
skimr::skim(fed_raw)
Name | fed_raw |
Number of rows | 246 |
Number of columns | 2 |
_______________________ | |
Column type frequency: | |
Date | 1 |
numeric | 1 |
________________________ | |
Group variables | None |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
date | 0 | 1 | 1960-01-01 | 2021-04-01 | 1990-08-16 | 246 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
cpaltt01usq657n | 0 | 1 | 0.91 | 0.81 | -2.83 | 0.4 | 0.8 | 1.2 | 3.95 | ▁▁▇▂▁ |
# Build a calendar-quarter key (`yq`) on both tables and attach quarterly inflation
# to every loan.
#
# Columns are referenced through dplyr's data mask (`date`, `issue_d`) rather than
# `fed_raw$date` / `lc_with_bonds$issue_d`: the `$` form reads the whole column of
# the outer data frame and would misalign or fail if rows were ever filtered or
# grouped earlier in the pipe.
fed_raw <- fed_raw %>%
  mutate(yq = as.factor(as.yearqtr(date, format = "%Y-%m-%d")))

lc_with_bonds <- lc_with_bonds %>%
  mutate(
    date = issue_d,
    yq = as.factor(as.yearqtr(issue_d, format = "%Y-%m-%d"))
  )

# Both tables carry a `date` column, so the join keeps them as `date.x` (loan issue
# date) and `date.y` (CPI observation date). The CPI value is renamed to `inflation`.
lc_with_bonds_inf <- lc_with_bonds %>%
  left_join(fed_raw, by = "yq") %>%
  rename(inflation = cpaltt01usq657n)
# Interest rate against quarterly inflation: one colour and one linear fit per grade
ggplot(lc_with_bonds_inf, aes(x = inflation, y = int_rate, color = grade)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw() +
  labs(
    title = "Inflation vs Interest Rate",
    subtitle = "Grouped by Grades",
    x = "inflation",
    y = "interest rate"
  )
## `geom_smooth()` using formula 'y ~ x'
set.seed(1234)
# Split the inflation-augmented frame (not lc_with_bonds) so that `training` and
# `testing` contain the `inflation` column the model below depends on.
train_test_split <- initial_split(lc_with_bonds_inf, prop = 0.5)
training <- training(train_test_split)
testing <- testing(train_test_split)

# 10-fold cross-validation
control <- trainControl(
  method = "cv",
  number = 10,
  verboseIter = FALSE # set to TRUE to have the model report progress after each estimation
)

# Linear model extended with inflation and its interaction with grade.
# It is fitted on the full lc_with_bonds_inf frame (not `training`) so that it can
# be compared with impr_model5 in the anova() that follows.
# NOTE(review): assumes impr_model5 was fitted on the same rows; confirm.
inf_model1 <- train(
  int_rate ~ installment * poly(loan_amnt, 3) + annual_inc * term + dti +
    grade * price + home_ownership * addr_state + grade * inflation,
  data = lc_with_bonds_inf,
  method = "lm",
  trControl = control
)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Coefficient table of the fitted linear model; NA rows are interaction terms that
# could not be estimated because of singularities
summary(inf_model1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.096277 -0.005242 0.000319 0.005463 0.035566
##
## Coefficients: (32 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.891e-02 1.519e-03 -38.779 < 2e-16 ***
## installment 4.879e-04 5.476e-06 89.102 < 2e-16 ***
## `poly(loan_amnt, 3)1` -2.652e+00 6.474e-02 -40.962 < 2e-16 ***
## `poly(loan_amnt, 3)2` 3.270e+00 6.042e-02 54.124 < 2e-16 ***
## `poly(loan_amnt, 3)3` -4.055e+00 5.744e-02 -70.594 < 2e-16 ***
## annual_inc -2.832e-09 8.299e-10 -3.413 0.000644 ***
## term60 4.519e-02 5.303e-04 85.222 < 2e-16 ***
## dti 2.841e-05 6.667e-06 4.261 2.04e-05 ***
## gradeB 4.716e-02 6.106e-04 77.236 < 2e-16 ***
## gradeC 7.394e-02 6.965e-04 106.155 < 2e-16 ***
## gradeD 1.046e-01 8.282e-04 126.254 < 2e-16 ***
## gradeE 1.284e-01 1.084e-03 118.395 < 2e-16 ***
## gradeF 1.465e-01 1.635e-03 89.636 < 2e-16 ***
## gradeG 1.551e-01 2.905e-03 53.390 < 2e-16 ***
## price 3.677e-03 1.637e-04 22.456 < 2e-16 ***
## home_ownershipNONE 5.363e-03 8.429e-03 0.636 0.524595
## home_ownershipOTHER -9.621e-03 8.452e-03 -1.138 0.254977
## home_ownershipOWN 5.984e-04 1.163e-03 0.515 0.606831
## home_ownershipRENT 1.485e-03 9.914e-04 1.498 0.134137
## addr_stateAZ 1.006e-03 6.225e-04 1.616 0.106102
## addr_stateCA 6.273e-04 5.244e-04 1.196 0.231651
## addr_stateCO 2.518e-04 6.491e-04 0.388 0.698107
## addr_stateCT -3.255e-04 6.711e-04 -0.485 0.627649
## addr_stateFL 6.436e-04 5.435e-04 1.184 0.236324
## addr_stateGA 3.312e-04 5.717e-04 0.579 0.562328
## addr_stateIL 8.089e-04 5.824e-04 1.389 0.164858
## addr_stateKS 3.238e-04 8.373e-04 0.387 0.698952
## addr_stateKY 6.648e-04 7.645e-04 0.870 0.384546
## addr_stateLA 1.445e-03 7.420e-04 1.948 0.051441 .
## addr_stateMA 3.823e-04 6.200e-04 0.617 0.537474
## addr_stateMD 6.756e-04 6.204e-04 1.089 0.276140
## addr_stateMI 7.079e-04 6.325e-04 1.119 0.263034
## addr_stateMN -2.433e-04 6.741e-04 -0.361 0.718148
## addr_stateMO -5.272e-04 6.407e-04 -0.823 0.410622
## addr_stateNC 3.112e-04 6.317e-04 0.493 0.622338
## addr_stateNJ 6.075e-04 5.949e-04 1.021 0.307177
## addr_stateNV 1.235e-03 7.163e-04 1.724 0.084722 .
## addr_stateNY -9.934e-05 5.695e-04 -0.174 0.861529
## addr_stateOH 1.006e-03 5.890e-04 1.707 0.087768 .
## addr_stateOK 1.448e-03 7.803e-04 1.856 0.063460 .
## addr_stateOR 2.943e-04 7.930e-04 0.371 0.710608
## addr_statePA 8.933e-05 5.857e-04 0.153 0.878777
## addr_stateSC 3.588e-04 7.010e-04 0.512 0.608770
## addr_stateTX 5.628e-04 5.398e-04 1.043 0.297112
## addr_stateUT -1.310e-04 8.337e-04 -0.157 0.875089
## addr_stateVA 1.568e-03 5.907e-04 2.654 0.007957 **
## addr_stateWA 5.591e-04 6.601e-04 0.847 0.396978
## addr_stateWI 1.009e-03 7.333e-04 1.376 0.168699
## inflation -5.073e-03 1.397e-04 -36.305 < 2e-16 ***
## `installment:poly(loan_amnt, 3)1` -3.913e-02 4.693e-04 -83.396 < 2e-16 ***
## `installment:poly(loan_amnt, 3)2` 1.343e-02 1.972e-04 68.117 < 2e-16 ***
## `installment:poly(loan_amnt, 3)3` 3.175e-04 3.519e-05 9.023 < 2e-16 ***
## `annual_inc:term60` 1.183e-08 1.874e-09 6.310 2.83e-10 ***
## `gradeB:price` -6.737e-03 2.205e-04 -30.553 < 2e-16 ***
## `gradeC:price` -9.503e-03 2.422e-04 -39.242 < 2e-16 ***
## `gradeD:price` -1.445e-02 2.846e-04 -50.790 < 2e-16 ***
## `gradeE:price` -1.830e-02 3.833e-04 -47.743 < 2e-16 ***
## `gradeF:price` -1.969e-02 6.107e-04 -32.237 < 2e-16 ***
## `gradeG:price` -1.754e-02 1.088e-03 -16.116 < 2e-16 ***
## `home_ownershipNONE:addr_stateAZ` NA NA NA NA
## `home_ownershipOTHER:addr_stateAZ` 9.409e-03 9.262e-03 1.016 0.309698
## `home_ownershipOWN:addr_stateAZ` -2.198e-03 1.725e-03 -1.274 0.202571
## `home_ownershipRENT:addr_stateAZ` -7.921e-04 1.156e-03 -0.685 0.493370
## `home_ownershipNONE:addr_stateCA` -6.869e-03 1.192e-02 -0.576 0.564433
## `home_ownershipOTHER:addr_stateCA` 9.469e-03 8.650e-03 1.095 0.273670
## `home_ownershipOWN:addr_stateCA` 5.735e-04 1.248e-03 0.459 0.645954
## `home_ownershipRENT:addr_stateCA` -1.235e-03 1.015e-03 -1.217 0.223794
## `home_ownershipNONE:addr_stateCO` NA NA NA NA
## `home_ownershipOTHER:addr_stateCO` 9.470e-03 9.259e-03 1.023 0.306391
## `home_ownershipOWN:addr_stateCO` 1.826e-04 1.871e-03 0.098 0.922275
## `home_ownershipRENT:addr_stateCO` -1.085e-03 1.167e-03 -0.930 0.352489
## `home_ownershipNONE:addr_stateCT` NA NA NA NA
## `home_ownershipOTHER:addr_stateCT` 2.754e-02 1.194e-02 2.308 0.021027 *
## `home_ownershipOWN:addr_stateCT` 2.249e-03 1.639e-03 1.372 0.169940
## `home_ownershipRENT:addr_stateCT` -1.041e-04 1.181e-03 -0.088 0.929771
## `home_ownershipNONE:addr_stateFL` NA NA NA NA
## `home_ownershipOTHER:addr_stateFL` 1.928e-02 9.444e-03 2.041 0.041253 *
## `home_ownershipOWN:addr_stateFL` 7.288e-05 1.296e-03 0.056 0.955143
## `home_ownershipRENT:addr_stateFL` -8.459e-04 1.045e-03 -0.810 0.418046
## `home_ownershipNONE:addr_stateGA` NA NA NA NA
## `home_ownershipOTHER:addr_stateGA` 1.209e-02 9.127e-03 1.325 0.185273
## `home_ownershipOWN:addr_stateGA` -1.137e-03 1.467e-03 -0.775 0.438356
## `home_ownershipRENT:addr_stateGA` -1.031e-03 1.107e-03 -0.932 0.351412
## `home_ownershipNONE:addr_stateIL` NA NA NA NA
## `home_ownershipOTHER:addr_stateIL` 1.463e-02 9.755e-03 1.499 0.133808
## `home_ownershipOWN:addr_stateIL` 1.378e-04 1.464e-03 0.094 0.925037
## `home_ownershipRENT:addr_stateIL` -9.565e-04 1.087e-03 -0.880 0.379114
## `home_ownershipNONE:addr_stateKS` NA NA NA NA
## `home_ownershipOTHER:addr_stateKS` 1.203e-02 1.037e-02 1.160 0.245910
## `home_ownershipOWN:addr_stateKS` -3.691e-04 2.043e-03 -0.181 0.856648
## `home_ownershipRENT:addr_stateKS` -1.179e-03 1.509e-03 -0.781 0.434650
## `home_ownershipNONE:addr_stateKY` NA NA NA NA
## `home_ownershipOTHER:addr_stateKY` NA NA NA NA
## `home_ownershipOWN:addr_stateKY` -3.197e-03 1.978e-03 -1.617 0.105930
## `home_ownershipRENT:addr_stateKY` -1.728e-03 1.463e-03 -1.181 0.237498
## `home_ownershipNONE:addr_stateLA` NA NA NA NA
## `home_ownershipOTHER:addr_stateLA` 1.860e-02 1.194e-02 1.557 0.119429
## `home_ownershipOWN:addr_stateLA` -1.720e-03 1.731e-03 -0.993 0.320505
## `home_ownershipRENT:addr_stateLA` -1.555e-03 1.327e-03 -1.173 0.240998
## `home_ownershipNONE:addr_stateMA` -1.514e-02 1.193e-02 -1.270 0.204209
## `home_ownershipOTHER:addr_stateMA` 5.455e-03 9.454e-03 0.577 0.563970
## `home_ownershipOWN:addr_stateMA` 6.812e-04 1.482e-03 0.460 0.645726
## `home_ownershipRENT:addr_stateMA` -1.475e-03 1.105e-03 -1.335 0.181771
## `home_ownershipNONE:addr_stateMD` NA NA NA NA
## `home_ownershipOTHER:addr_stateMD` 9.303e-03 1.035e-02 0.899 0.368624
## `home_ownershipOWN:addr_stateMD` -8.501e-04 1.562e-03 -0.544 0.586267
## `home_ownershipRENT:addr_stateMD` -6.917e-04 1.128e-03 -0.613 0.539856
## `home_ownershipNONE:addr_stateMI` NA NA NA NA
## `home_ownershipOTHER:addr_stateMI` 1.415e-02 1.035e-02 1.368 0.171385
## `home_ownershipOWN:addr_stateMI` -9.393e-04 1.579e-03 -0.595 0.552028
## `home_ownershipRENT:addr_stateMI` -1.137e-03 1.225e-03 -0.928 0.353286
## `home_ownershipNONE:addr_stateMN` NA NA NA NA
## `home_ownershipOTHER:addr_stateMN` 2.286e-02 1.194e-02 1.915 0.055558 .
## `home_ownershipOWN:addr_stateMN` -2.103e-03 1.793e-03 -1.173 0.240745
## `home_ownershipRENT:addr_stateMN` 1.064e-03 1.222e-03 0.871 0.383868
## `home_ownershipNONE:addr_stateMO` NA NA NA NA
## `home_ownershipOTHER:addr_stateMO` 1.734e-02 1.194e-02 1.452 0.146432
## `home_ownershipOWN:addr_stateMO` 2.050e-03 1.676e-03 1.223 0.221365
## `home_ownershipRENT:addr_stateMO` 1.013e-03 1.222e-03 0.828 0.407413
## `home_ownershipNONE:addr_stateNC` NA NA NA NA
## `home_ownershipOTHER:addr_stateNC` -7.839e-03 1.194e-02 -0.657 0.511502
## `home_ownershipOWN:addr_stateNC` -2.269e-03 1.653e-03 -1.373 0.169725
## `home_ownershipRENT:addr_stateNC` -1.745e-03 1.182e-03 -1.476 0.139848
## `home_ownershipNONE:addr_stateNJ` NA NA NA NA
## `home_ownershipOTHER:addr_stateNJ` 6.196e-03 9.758e-03 0.635 0.525456
## `home_ownershipOWN:addr_stateNJ` -3.040e-04 1.369e-03 -0.222 0.824298
## `home_ownershipRENT:addr_stateNJ` -5.709e-04 1.078e-03 -0.530 0.596283
## `home_ownershipNONE:addr_stateNV` NA NA NA NA
## `home_ownershipOTHER:addr_stateNV` 2.168e-02 1.194e-02 1.815 0.069463 .
## `home_ownershipOWN:addr_stateNV` 9.269e-04 2.359e-03 0.393 0.694406
## `home_ownershipRENT:addr_stateNV` -9.367e-04 1.257e-03 -0.745 0.456269
## `home_ownershipNONE:addr_stateNY` NA NA NA NA
## `home_ownershipOTHER:addr_stateNY` 1.160e-02 9.036e-03 1.283 0.199409
## `home_ownershipOWN:addr_stateNY` 1.560e-03 1.280e-03 1.219 0.222877
## `home_ownershipRENT:addr_stateNY` 7.188e-06 1.044e-03 0.007 0.994509
## `home_ownershipNONE:addr_stateOH` NA NA NA NA
## `home_ownershipOTHER:addr_stateOH` 2.668e-02 1.035e-02 2.578 0.009928 **
## `home_ownershipOWN:addr_stateOH` -4.146e-04 1.479e-03 -0.280 0.779178
## `home_ownershipRENT:addr_stateOH` -6.315e-04 1.116e-03 -0.566 0.571559
## `home_ownershipNONE:addr_stateOK` NA NA NA NA
## `home_ownershipOTHER:addr_stateOK` NA NA NA NA
## `home_ownershipOWN:addr_stateOK` -1.742e-03 2.062e-03 -0.845 0.398241
## `home_ownershipRENT:addr_stateOK` 6.136e-04 1.503e-03 0.408 0.683009
## `home_ownershipNONE:addr_stateOR` NA NA NA NA
## `home_ownershipOTHER:addr_stateOR` 6.573e-03 1.195e-02 0.550 0.582283
## `home_ownershipOWN:addr_stateOR` 2.671e-03 2.484e-03 1.075 0.282212
## `home_ownershipRENT:addr_stateOR` 1.351e-04 1.285e-03 0.105 0.916297
## `home_ownershipNONE:addr_statePA` NA NA NA NA
## `home_ownershipOTHER:addr_statePA` 1.352e-02 1.194e-02 1.132 0.257442
## `home_ownershipOWN:addr_statePA` 7.169e-05 1.385e-03 0.052 0.958726
## `home_ownershipRENT:addr_statePA` -2.175e-04 1.091e-03 -0.199 0.842014
## `home_ownershipNONE:addr_stateSC` NA NA NA NA
## `home_ownershipOTHER:addr_stateSC` 1.261e-02 1.194e-02 1.056 0.290966
## `home_ownershipOWN:addr_stateSC` 4.037e-04 1.755e-03 0.230 0.818019
## `home_ownershipRENT:addr_stateSC` 3.458e-04 1.322e-03 0.261 0.793743
## `home_ownershipNONE:addr_stateTX` NA NA NA NA
## `home_ownershipOTHER:addr_stateTX` 1.245e-02 9.130e-03 1.363 0.172768
## `home_ownershipOWN:addr_stateTX` 7.230e-05 1.292e-03 0.056 0.955382
## `home_ownershipRENT:addr_stateTX` -1.154e-03 1.050e-03 -1.099 0.271707
## `home_ownershipNONE:addr_stateUT` NA NA NA NA
## `home_ownershipOTHER:addr_stateUT` NA NA NA NA
## `home_ownershipOWN:addr_stateUT` 8.861e-04 3.457e-03 0.256 0.797688
## `home_ownershipRENT:addr_stateUT` 3.875e-04 1.477e-03 0.262 0.793018
## `home_ownershipNONE:addr_stateVA` NA NA NA NA
## `home_ownershipOTHER:addr_stateVA` 1.351e-02 9.130e-03 1.480 0.138923
## `home_ownershipOWN:addr_stateVA` -1.262e-04 1.490e-03 -0.085 0.932510
## `home_ownershipRENT:addr_stateVA` -1.287e-03 1.095e-03 -1.175 0.239882
## `home_ownershipNONE:addr_stateWA` NA NA NA NA
## `home_ownershipOTHER:addr_stateWA` NA NA NA NA
## `home_ownershipOWN:addr_stateWA` 2.358e-04 1.906e-03 0.124 0.901519
## `home_ownershipRENT:addr_stateWA` -2.917e-04 1.156e-03 -0.252 0.800792
## `home_ownershipNONE:addr_stateWI` NA NA NA NA
## `home_ownershipOTHER:addr_stateWI` NA NA NA NA
## `home_ownershipOWN:addr_stateWI` -4.566e-03 2.208e-03 -2.068 0.038613 *
## `home_ownershipRENT:addr_stateWI` -1.166e-03 1.280e-03 -0.911 0.362407
## `gradeB:inflation` 3.656e-03 1.860e-04 19.659 < 2e-16 ***
## `gradeC:inflation` 5.938e-03 1.978e-04 30.023 < 2e-16 ***
## `gradeD:inflation` 7.934e-03 2.310e-04 34.345 < 2e-16 ***
## `gradeE:inflation` 9.499e-03 3.006e-04 31.601 < 2e-16 ***
## `gradeF:inflation` 1.073e-02 5.058e-04 21.208 < 2e-16 ***
## `gradeG:inflation` 7.759e-03 8.704e-04 8.914 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008421 on 37719 degrees of freedom
## Multiple R-squared: 0.9491, Adjusted R-squared: 0.9489
## F-statistic: 4755 on 148 and 37719 DF, p-value: < 2.2e-16
# F-test comparing the underlying lm fits of impr_model5 and the inflation model.
# NOTE(review): impr_model5 is defined earlier in the file; presumably it is the
# same specification without the inflation terms. Confirm the models are nested.
anova(impr_model5$finalModel, inf_model1$finalModel)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
---|---|---|---|---|---|
3.77e+04 | 2.83 | ||||
3.77e+04 | 2.67 | 7 | 0.151 | 304 | 0 |
# Drop loans with home_ownership == "NONE": almost all of their state interactions
# were inestimable (NA) in inf_model1.
lc_with_bonds_inf <- lc_with_bonds_inf %>%
  filter(home_ownership != "NONE")

# 50/50 train/test split of the filtered, inflation-augmented data
set.seed(1234)
train_test_split <- initial_split(lc_with_bonds_inf, prop = 0.5)
training <- training(train_test_split)
testing <- testing(train_test_split)

# 10-fold cross-validation
control <- trainControl(
  method = "cv",
  number = 10,
  verboseIter = FALSE # set to TRUE to have the model report progress after each estimation
)

# Candidate penalties: 1000 evenly spaced values between 0 and 0.01
lambda_seq <- seq(0, 0.01, length.out = 1000)

# LASSO regression using k-fold cross-validation to select the best lambda.
# NOTE(review): many home_ownershipOTHER:addr_state* dummies are constant within a
# fold, which triggers caret's "zero variances" warnings. Adding "zv" in front of
# "center" in preProc would drop them, but it also changes the size of the
# coefficient matrix, so it is left as a follow-up.
lasso <- train(
  int_rate ~ installment * poly(loan_amnt, 3) + term + dti + grade * price +
    grade:poly(loan_amnt, 3):term + poly(loan_amnt, 3):term + grade:term +
    home_ownership * addr_state * annual_inc + inflation * grade * price,
  data = training,
  method = "glmnet",
  preProc = c("center", "scale"), # standardize the predictors before running the LASSO regression
  trControl = control,
  tuneGrid = expand.grid(alpha = 1, lambda = lambda_seq) # alpha = 1 is LASSO; alpha = 0 would be ridge regression
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: home_ownershipOTHER:addr_stateIL,
## home_ownershipOTHER:addr_stateKY, home_ownershipOTHER:addr_stateLA,
## home_ownershipOTHER:addr_stateMD, home_ownershipOTHER:addr_stateMO,
## home_ownershipOTHER:addr_stateNC, home_ownershipOTHER:addr_stateOH,
## home_ownershipOTHER:addr_stateOK, home_ownershipOTHER:addr_stateOR,
## home_ownershipOTHER:addr_statePA, home_ownershipOTHER:addr_stateSC,
## home_ownershipOTHER:addr_stateUT, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateIL:annual_inc,
## home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut
## = 19, uniqueCut = 10, : These variables have zero variances:
## home_ownershipOTHER:addr_stateKY, home_ownershipOTHER:addr_stateLA,
## home_ownershipOTHER:addr_stateMD, home_ownershipOTHER:addr_stateMO,
## home_ownershipOTHER:addr_stateNC, home_ownershipOTHER:addr_stateOH,
## home_ownershipOTHER:addr_stateOK, home_ownershipOTHER:addr_stateOR,
## home_ownershipOTHER:addr_statePA, home_ownershipOTHER:addr_stateSC,
## home_ownershipOTHER:addr_stateUT, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: home_ownershipOTHER:addr_stateCT,
## home_ownershipOTHER:addr_stateKY, home_ownershipOTHER:addr_stateLA,
## home_ownershipOTHER:addr_stateMD, home_ownershipOTHER:addr_stateMO,
## home_ownershipOTHER:addr_stateNC, home_ownershipOTHER:addr_stateOH,
## home_ownershipOTHER:addr_stateOK, home_ownershipOTHER:addr_stateOR,
## home_ownershipOTHER:addr_statePA, home_ownershipOTHER:addr_stateSC,
## home_ownershipOTHER:addr_stateUT, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateCT:annual_inc,
## home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: home_ownershipOTHER:addr_stateCO,
## home_ownershipOTHER:addr_stateKY, home_ownershipOTHER:addr_stateLA,
## home_ownershipOTHER:addr_stateMD, home_ownershipOTHER:addr_stateMO,
## home_ownershipOTHER:addr_stateNC, home_ownershipOTHER:addr_stateNV,
## home_ownershipOTHER:addr_stateOH, home_ownershipOTHER:addr_stateOK,
## home_ownershipOTHER:addr_stateOR, home_ownershipOTHER:addr_statePA,
## home_ownershipOTHER:addr_stateSC, home_ownershipOTHER:addr_stateUT,
## home_ownershipOTHER:addr_stateVA, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateCO:annual_inc,
## home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateNV:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateVA:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: home_ownershipOTHER:addr_stateKY,
## home_ownershipOTHER:addr_stateLA, home_ownershipOTHER:addr_stateMD,
## home_ownershipOTHER:addr_stateMN, home_ownershipOTHER:addr_stateMO,
## home_ownershipOTHER:addr_stateNC, home_ownershipOTHER:addr_stateOH,
## home_ownershipOTHER:addr_stateOK, home_ownershipOTHER:addr_stateOR,
## home_ownershipOTHER:addr_statePA, home_ownershipOTHER:addr_stateSC,
## home_ownershipOTHER:addr_stateUT, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMN:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: home_ownershipOTHER:addr_stateKY,
## home_ownershipOTHER:addr_stateLA, home_ownershipOTHER:addr_stateMA,
## home_ownershipOTHER:addr_stateMD, home_ownershipOTHER:addr_stateMI,
## home_ownershipOTHER:addr_stateMO, home_ownershipOTHER:addr_stateNC,
## home_ownershipOTHER:addr_stateNJ, home_ownershipOTHER:addr_stateOH,
## home_ownershipOTHER:addr_stateOK, home_ownershipOTHER:addr_stateOR,
## home_ownershipOTHER:addr_statePA, home_ownershipOTHER:addr_stateSC,
## home_ownershipOTHER:addr_stateUT, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMI:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateNJ:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: home_ownershipOTHER:addr_stateKS,
## home_ownershipOTHER:addr_stateKY, home_ownershipOTHER:addr_stateLA,
## home_ownershipOTHER:addr_stateMD, home_ownershipOTHER:addr_stateMO,
## home_ownershipOTHER:addr_stateNC, home_ownershipOTHER:addr_stateOH,
## home_ownershipOTHER:addr_stateOK, home_ownershipOTHER:addr_stateOR,
## home_ownershipOTHER:addr_statePA, home_ownershipOTHER:addr_stateSC,
## home_ownershipOTHER:addr_stateUT, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateKS:annual_inc,
## home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut
## = 19, uniqueCut = 10, : These variables have zero variances:
## home_ownershipOTHER:addr_stateKY, home_ownershipOTHER:addr_stateLA,
## home_ownershipOTHER:addr_stateMD, home_ownershipOTHER:addr_stateMO,
## home_ownershipOTHER:addr_stateNC, home_ownershipOTHER:addr_stateOH,
## home_ownershipOTHER:addr_stateOK, home_ownershipOTHER:addr_stateOR,
## home_ownershipOTHER:addr_statePA, home_ownershipOTHER:addr_stateSC,
## home_ownershipOTHER:addr_stateUT, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: home_ownershipOTHER:addr_stateAZ,
## home_ownershipOTHER:addr_stateKY, home_ownershipOTHER:addr_stateLA,
## home_ownershipOTHER:addr_stateMD, home_ownershipOTHER:addr_stateMO,
## home_ownershipOTHER:addr_stateNC, home_ownershipOTHER:addr_stateOH,
## home_ownershipOTHER:addr_stateOK, home_ownershipOTHER:addr_stateOR,
## home_ownershipOTHER:addr_statePA, home_ownershipOTHER:addr_stateSC,
## home_ownershipOTHER:addr_stateUT, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateAZ:annual_inc,
## home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut
## = 19, uniqueCut = 10, : These variables have zero variances:
## home_ownershipOTHER:addr_stateKY, home_ownershipOTHER:addr_stateLA,
## home_ownershipOTHER:addr_stateMD, home_ownershipOTHER:addr_stateMO,
## home_ownershipOTHER:addr_stateNC, home_ownershipOTHER:addr_stateOH,
## home_ownershipOTHER:addr_stateOK, home_ownershipOTHER:addr_stateOR,
## home_ownershipOTHER:addr_statePA, home_ownershipOTHER:addr_stateSC,
## home_ownershipOTHER:addr_stateUT, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut
## = 19, uniqueCut = 10, : These variables have zero variances:
## home_ownershipOTHER:addr_stateKY, home_ownershipOTHER:addr_stateLA,
## home_ownershipOTHER:addr_stateMD, home_ownershipOTHER:addr_stateMO,
## home_ownershipOTHER:addr_stateNC, home_ownershipOTHER:addr_stateOH,
## home_ownershipOTHER:addr_stateOK, home_ownershipOTHER:addr_stateOR,
## home_ownershipOTHER:addr_statePA, home_ownershipOTHER:addr_stateSC,
## home_ownershipOTHER:addr_stateUT, home_ownershipOTHER:addr_stateWA,
## home_ownershipOTHER:addr_stateWI, home_ownershipOTHER:addr_stateKY:annual_inc,
## home_ownershipOTHER:addr_stateLA:annual_inc,
## home_ownershipOTHER:addr_stateMD:annual_inc,
## home_ownershipOTHER:addr_stateMO:annual_inc,
## home_ownershipOTHER:addr_stateNC:annual_inc,
## home_ownershipOTHER:addr_stateOH:annual_inc,
## home_ownershipOTHER:addr_stateOK:annual_inc,
## home_ownershipOTHER:addr_stateOR:annual_inc,
## home_ownershipOTHER:addr_statePA:annual_inc,
## home_ownershipOTHER:addr_stateSC:annual_inc,
## home_ownershipOTHER:addr_stateUT:annual_inc,
## home_ownershipOTHER:addr_stateWA:annual_inc,
## home_ownershipOTHER:addr_stateWI:annual_inc
# Coefficients of the final glmnet fit, evaluated at the lambda chosen by tuning.
# Entries printed as "." in the sparse matrix are exactly zero.
coef(lasso$finalModel, s = lasso$bestTune$lambda)
## 321 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.204056e-01
## installment 4.115561e-01
## poly(loan_amnt, 3)1 -1.230138e-01
## poly(loan_amnt, 3)2 8.725921e-02
## poly(loan_amnt, 3)3 -4.488952e-02
## term60 8.376229e-02
## dti 4.614182e-05
## gradeB 6.224430e-03
## gradeC 6.461531e-03
## gradeD 7.015232e-03
## gradeE 5.023447e-03
## gradeF 2.876596e-03
## gradeG 1.638116e-03
## price 8.321902e-04
## home_ownershipOTHER .
## home_ownershipOWN -4.021051e-05
## home_ownershipRENT 1.953491e-04
## addr_stateAZ 2.381513e-05
## addr_stateCA 6.200061e-05
## addr_stateCO -4.978047e-05
## addr_stateCT -1.120084e-04
## addr_stateFL -1.273741e-05
## addr_stateGA .
## addr_stateIL -6.052498e-05
## addr_stateKS 4.294079e-05
## addr_stateKY 4.761425e-05
## addr_stateLA 1.479635e-05
## addr_stateMA 2.528222e-06
## addr_stateMD -2.046980e-05
## addr_stateMI -4.616264e-05
## addr_stateMN -6.902399e-05
## addr_stateMO -5.518226e-05
## addr_stateNC 8.918176e-05
## addr_stateNJ 7.068406e-05
## addr_stateNV 3.478825e-05
## addr_stateNY .
## addr_stateOH -2.203239e-05
## addr_stateOK 1.019713e-04
## addr_stateOR 7.743378e-05
## addr_statePA -6.217609e-05
## addr_stateSC 8.263521e-05
## addr_stateTX 1.508205e-04
## addr_stateUT 2.632515e-05
## addr_stateVA .
## addr_stateWA 1.593356e-05
## addr_stateWI -7.437448e-05
## annual_inc -6.540116e-06
## inflation -3.637369e-03
## installment:poly(loan_amnt, 3)1 -3.714802e-01
## installment:poly(loan_amnt, 3)2 1.020580e-01
## installment:poly(loan_amnt, 3)3 -2.576164e-04
## gradeB:price -3.428323e-03
## gradeC:price -2.205651e-03
## gradeD:price -2.218767e-03
## gradeE:price -7.590612e-04
## gradeF:price .
## gradeG:price 2.055998e-04
## poly(loan_amnt, 3)1:term60 2.658739e-02
## poly(loan_amnt, 3)2:term60 -2.100861e-02
## poly(loan_amnt, 3)3:term60 9.019366e-03
## term60:gradeB -6.590330e-04
## term60:gradeC -8.530498e-04
## term60:gradeD -1.104029e-03
## term60:gradeE -1.204616e-03
## term60:gradeF -7.792144e-04
## term60:gradeG -5.588835e-04
## home_ownershipOTHER:addr_stateAZ 3.196276e-05
## home_ownershipOWN:addr_stateAZ 8.385204e-07
## home_ownershipRENT:addr_stateAZ -1.234626e-05
## home_ownershipOTHER:addr_stateCA -1.460746e-04
## home_ownershipOWN:addr_stateCA -6.191672e-06
## home_ownershipRENT:addr_stateCA -7.379137e-05
## home_ownershipOTHER:addr_stateCO -2.086282e-05
## home_ownershipOWN:addr_stateCO 4.370118e-06
## home_ownershipRENT:addr_stateCO -1.542741e-06
## home_ownershipOTHER:addr_stateCT 6.802870e-06
## home_ownershipOWN:addr_stateCT 1.002743e-04
## home_ownershipRENT:addr_stateCT 1.388691e-04
## home_ownershipOTHER:addr_stateFL -4.929873e-05
## home_ownershipOWN:addr_stateFL 8.464146e-05
## home_ownershipRENT:addr_stateFL 1.379876e-05
## home_ownershipOTHER:addr_stateGA .
## home_ownershipOWN:addr_stateGA 4.875441e-05
## home_ownershipRENT:addr_stateGA 6.936153e-05
## home_ownershipOTHER:addr_stateIL -1.149628e-06
## home_ownershipOWN:addr_stateIL 1.288560e-04
## home_ownershipRENT:addr_stateIL 1.323931e-04
## home_ownershipOTHER:addr_stateKS 1.728499e-05
## home_ownershipOWN:addr_stateKS -3.771781e-05
## home_ownershipRENT:addr_stateKS -2.173264e-05
## home_ownershipOTHER:addr_stateKY .
## home_ownershipOWN:addr_stateKY .
## home_ownershipRENT:addr_stateKY -3.739498e-05
## home_ownershipOTHER:addr_stateLA .
## home_ownershipOWN:addr_stateLA -2.917501e-05
## home_ownershipRENT:addr_stateLA .
## home_ownershipOTHER:addr_stateMA 1.664977e-05
## home_ownershipOWN:addr_stateMA -8.005199e-06
## home_ownershipRENT:addr_stateMA -6.987510e-05
## home_ownershipOTHER:addr_stateMD .
## home_ownershipOWN:addr_stateMD 7.719732e-05
## home_ownershipRENT:addr_stateMD 1.847407e-05
## home_ownershipOTHER:addr_stateMI 1.231548e-07
## home_ownershipOWN:addr_stateMI 3.774470e-05
## home_ownershipRENT:addr_stateMI -5.298453e-05
## home_ownershipOTHER:addr_stateMN 1.494997e-07
## home_ownershipOWN:addr_stateMN 1.699172e-06
## home_ownershipRENT:addr_stateMN 2.225088e-04
## home_ownershipOTHER:addr_stateMO .
## home_ownershipOWN:addr_stateMO 3.901858e-05
## home_ownershipRENT:addr_stateMO 3.260785e-06
## home_ownershipOTHER:addr_stateNC .
## home_ownershipOWN:addr_stateNC -3.292687e-05
## home_ownershipRENT:addr_stateNC .
## home_ownershipOTHER:addr_stateNJ -1.477911e-07
## home_ownershipOWN:addr_stateNJ 1.916159e-05
## home_ownershipRENT:addr_stateNJ 6.947543e-05
## home_ownershipOTHER:addr_stateNV 5.551325e-05
## home_ownershipOWN:addr_stateNV 1.457761e-05
## home_ownershipRENT:addr_stateNV .
## home_ownershipOTHER:addr_stateNY 6.628160e-05
## home_ownershipOWN:addr_stateNY .
## home_ownershipRENT:addr_stateNY .
## home_ownershipOTHER:addr_stateOH .
## home_ownershipOWN:addr_stateOH 3.515603e-05
## home_ownershipRENT:addr_stateOH 6.057449e-05
## home_ownershipOTHER:addr_stateOK .
## home_ownershipOWN:addr_stateOK -4.072199e-05
## home_ownershipRENT:addr_stateOK 1.146268e-06
## home_ownershipOTHER:addr_stateOR .
## home_ownershipOWN:addr_stateOR 1.136830e-05
## home_ownershipRENT:addr_stateOR -4.414057e-05
## home_ownershipOTHER:addr_statePA .
## home_ownershipOWN:addr_statePA 6.770752e-05
## home_ownershipRENT:addr_statePA 1.234819e-05
## home_ownershipOTHER:addr_stateSC .
## home_ownershipOWN:addr_stateSC -1.206835e-04
## home_ownershipRENT:addr_stateSC 4.490288e-06
## home_ownershipOTHER:addr_stateTX 1.354214e-05
## home_ownershipOWN:addr_stateTX -7.701466e-06
## home_ownershipRENT:addr_stateTX -1.218740e-04
## home_ownershipOTHER:addr_stateUT .
## home_ownershipOWN:addr_stateUT -1.662737e-05
## home_ownershipRENT:addr_stateUT 8.435364e-05
## home_ownershipOTHER:addr_stateVA -1.533613e-07
## home_ownershipOWN:addr_stateVA 2.708428e-05
## home_ownershipRENT:addr_stateVA -1.037694e-04
## home_ownershipOTHER:addr_stateWA .
## home_ownershipOWN:addr_stateWA -3.523082e-05
## home_ownershipRENT:addr_stateWA 1.028599e-04
## home_ownershipOTHER:addr_stateWI .
## home_ownershipOWN:addr_stateWI 2.520819e-05
## home_ownershipRENT:addr_stateWI .
## home_ownershipOTHER:annual_inc .
## home_ownershipOWN:annual_inc -2.152167e-05
## home_ownershipRENT:annual_inc -8.114396e-06
## addr_stateAZ:annual_inc -1.023752e-05
## addr_stateCA:annual_inc .
## addr_stateCO:annual_inc 4.944447e-05
## addr_stateCT:annual_inc 1.240234e-05
## addr_stateFL:annual_inc .
## addr_stateGA:annual_inc -3.596060e-05
## addr_stateIL:annual_inc 9.495055e-05
## addr_stateKS:annual_inc -6.497676e-05
## addr_stateKY:annual_inc -5.892502e-05
## addr_stateLA:annual_inc -4.769239e-05
## addr_stateMA:annual_inc -7.235475e-06
## addr_stateMD:annual_inc 4.998360e-05
## addr_stateMI:annual_inc 1.030505e-04
## addr_stateMN:annual_inc -1.006432e-07
## addr_stateMO:annual_inc 3.764251e-05
## addr_stateNC:annual_inc -1.226296e-05
## addr_stateNJ:annual_inc 3.616753e-05
## addr_stateNV:annual_inc -1.329685e-06
## addr_stateNY:annual_inc -3.001721e-06
## addr_stateOH:annual_inc 5.792967e-05
## addr_stateOK:annual_inc -8.581138e-05
## addr_stateOR:annual_inc -3.096489e-05
## addr_statePA:annual_inc 5.045925e-05
## addr_stateSC:annual_inc -5.924825e-05
## addr_stateTX:annual_inc -6.985813e-05
## addr_stateUT:annual_inc -2.110511e-05
## addr_stateVA:annual_inc 6.451402e-05
## addr_stateWA:annual_inc 6.278008e-05
## addr_stateWI:annual_inc 1.220663e-05
## gradeB:inflation 1.427490e-03
## gradeC:inflation 3.467400e-03
## gradeD:inflation 2.079845e-03
## gradeE:inflation 1.232290e-03
## gradeF:inflation 8.101150e-04
## gradeG:inflation 1.152579e-04
## price:inflation 1.790835e-03
## poly(loan_amnt, 3)1:term36:gradeB -2.972859e-03
## poly(loan_amnt, 3)2:term36:gradeB 2.152156e-03
## poly(loan_amnt, 3)3:term36:gradeB -1.106535e-03
## poly(loan_amnt, 3)1:term60:gradeB -1.481417e-03
## poly(loan_amnt, 3)2:term60:gradeB 1.499541e-03
## poly(loan_amnt, 3)3:term60:gradeB -2.412059e-04
## poly(loan_amnt, 3)1:term36:gradeC -3.770313e-03
## poly(loan_amnt, 3)2:term36:gradeC 2.814496e-03
## poly(loan_amnt, 3)3:term36:gradeC -1.554475e-03
## poly(loan_amnt, 3)1:term60:gradeC -3.065874e-03
## poly(loan_amnt, 3)2:term60:gradeC 2.586478e-03
## poly(loan_amnt, 3)3:term60:gradeC -8.284202e-04
## poly(loan_amnt, 3)1:term36:gradeD -4.237501e-03
## poly(loan_amnt, 3)2:term36:gradeD 2.917238e-03
## poly(loan_amnt, 3)3:term36:gradeD -1.655473e-03
## poly(loan_amnt, 3)1:term60:gradeD -3.812423e-03
## poly(loan_amnt, 3)2:term60:gradeD 3.132671e-03
## poly(loan_amnt, 3)3:term60:gradeD -1.090118e-03
## poly(loan_amnt, 3)1:term36:gradeE -2.910973e-03
## poly(loan_amnt, 3)2:term36:gradeE 1.973013e-03
## poly(loan_amnt, 3)3:term36:gradeE -1.097280e-03
## poly(loan_amnt, 3)1:term60:gradeE -5.929336e-03
## poly(loan_amnt, 3)2:term60:gradeE 5.044649e-03
## poly(loan_amnt, 3)3:term60:gradeE -1.757503e-03
## poly(loan_amnt, 3)1:term36:gradeF -1.822268e-03
## poly(loan_amnt, 3)2:term36:gradeF 9.672291e-04
## poly(loan_amnt, 3)3:term36:gradeF -6.383266e-04
## poly(loan_amnt, 3)1:term60:gradeF -5.372785e-03
## poly(loan_amnt, 3)2:term60:gradeF 4.366152e-03
## poly(loan_amnt, 3)3:term60:gradeF -1.639044e-03
## poly(loan_amnt, 3)1:term36:gradeG -7.779189e-04
## poly(loan_amnt, 3)2:term36:gradeG 5.017104e-04
## poly(loan_amnt, 3)3:term36:gradeG -1.404546e-04
## poly(loan_amnt, 3)1:term60:gradeG -3.646725e-03
## poly(loan_amnt, 3)2:term60:gradeG 2.802203e-03
## poly(loan_amnt, 3)3:term60:gradeG -1.166074e-03
## home_ownershipOTHER:addr_stateAZ:annual_inc 2.449713e-07
## home_ownershipOWN:addr_stateAZ:annual_inc 1.167209e-05
## home_ownershipRENT:addr_stateAZ:annual_inc .
## home_ownershipOTHER:addr_stateCA:annual_inc 1.123740e-04
## home_ownershipOWN:addr_stateCA:annual_inc 4.016358e-05
## home_ownershipRENT:addr_stateCA:annual_inc -9.489654e-06
## home_ownershipOTHER:addr_stateCO:annual_inc -4.322584e-06
## home_ownershipOWN:addr_stateCO:annual_inc 2.006481e-05
## home_ownershipRENT:addr_stateCO:annual_inc -5.199360e-05
## home_ownershipOTHER:addr_stateCT:annual_inc .
## home_ownershipOWN:addr_stateCT:annual_inc -6.595563e-05
## home_ownershipRENT:addr_stateCT:annual_inc -3.621306e-05
## home_ownershipOTHER:addr_stateFL:annual_inc 4.836218e-05
## home_ownershipOWN:addr_stateFL:annual_inc -5.274012e-05
## home_ownershipRENT:addr_stateFL:annual_inc -1.735298e-05
## home_ownershipOTHER:addr_stateGA:annual_inc 2.338807e-05
## home_ownershipOWN:addr_stateGA:annual_inc 1.160667e-05
## home_ownershipRENT:addr_stateGA:annual_inc -1.258267e-05
## home_ownershipOTHER:addr_stateIL:annual_inc .
## home_ownershipOWN:addr_stateIL:annual_inc -2.966222e-05
## home_ownershipRENT:addr_stateIL:annual_inc -9.242133e-05
## home_ownershipOTHER:addr_stateKS:annual_inc 2.611862e-05
## home_ownershipOWN:addr_stateKS:annual_inc 4.278408e-06
## home_ownershipRENT:addr_stateKS:annual_inc -1.565041e-05
## home_ownershipOTHER:addr_stateKY:annual_inc .
## home_ownershipOWN:addr_stateKY:annual_inc 3.415290e-06
## home_ownershipRENT:addr_stateKY:annual_inc 2.888857e-05
## home_ownershipOTHER:addr_stateLA:annual_inc .
## home_ownershipOWN:addr_stateLA:annual_inc 6.074065e-05
## home_ownershipRENT:addr_stateLA:annual_inc 4.361700e-05
## home_ownershipOTHER:addr_stateMA:annual_inc .
## home_ownershipOWN:addr_stateMA:annual_inc 1.529995e-05
## home_ownershipRENT:addr_stateMA:annual_inc -9.238453e-06
## home_ownershipOTHER:addr_stateMD:annual_inc .
## home_ownershipOWN:addr_stateMD:annual_inc -1.435698e-05
## home_ownershipRENT:addr_stateMD:annual_inc -4.029609e-05
## home_ownershipOTHER:addr_stateMI:annual_inc 4.491816e-05
## home_ownershipOWN:addr_stateMI:annual_inc -5.035724e-05
## home_ownershipRENT:addr_stateMI:annual_inc 5.409820e-05
## home_ownershipOTHER:addr_stateMN:annual_inc 5.739514e-05
## home_ownershipOWN:addr_stateMN:annual_inc 2.008120e-06
## home_ownershipRENT:addr_stateMN:annual_inc -1.071684e-04
## home_ownershipOTHER:addr_stateMO:annual_inc .
## home_ownershipOWN:addr_stateMO:annual_inc 8.029634e-08
## home_ownershipRENT:addr_stateMO:annual_inc .
## home_ownershipOTHER:addr_stateNC:annual_inc .
## home_ownershipOWN:addr_stateNC:annual_inc -2.123761e-05
## home_ownershipRENT:addr_stateNC:annual_inc -8.025617e-05
## home_ownershipOTHER:addr_stateNJ:annual_inc -2.476715e-05
## home_ownershipOWN:addr_stateNJ:annual_inc .
## home_ownershipRENT:addr_stateNJ:annual_inc -1.236532e-04
## home_ownershipOTHER:addr_stateNV:annual_inc 3.733665e-07
## home_ownershipOWN:addr_stateNV:annual_inc .
## home_ownershipRENT:addr_stateNV:annual_inc 1.223902e-05
## home_ownershipOTHER:addr_stateNY:annual_inc -5.884191e-05
## home_ownershipOWN:addr_stateNY:annual_inc 6.654452e-05
## home_ownershipRENT:addr_stateNY:annual_inc 2.349599e-05
## home_ownershipOTHER:addr_stateOH:annual_inc .
## home_ownershipOWN:addr_stateOH:annual_inc .
## home_ownershipRENT:addr_stateOH:annual_inc -1.017848e-05
## home_ownershipOTHER:addr_stateOK:annual_inc .
## home_ownershipOWN:addr_stateOK:annual_inc 3.083452e-05
## home_ownershipRENT:addr_stateOK:annual_inc -9.251573e-06
## home_ownershipOTHER:addr_stateOR:annual_inc .
## home_ownershipOWN:addr_stateOR:annual_inc 2.900924e-05
## home_ownershipRENT:addr_stateOR:annual_inc 1.878517e-06
## home_ownershipOTHER:addr_statePA:annual_inc .
## home_ownershipOWN:addr_statePA:annual_inc -7.060510e-05
## home_ownershipRENT:addr_statePA:annual_inc -1.312299e-07
## home_ownershipOTHER:addr_stateSC:annual_inc .
## home_ownershipOWN:addr_stateSC:annual_inc 9.212776e-05
## home_ownershipRENT:addr_stateSC:annual_inc -3.813217e-05
## home_ownershipOTHER:addr_stateTX:annual_inc -1.129625e-05
## home_ownershipOWN:addr_stateTX:annual_inc .
## home_ownershipRENT:addr_stateTX:annual_inc 3.147165e-05
## home_ownershipOTHER:addr_stateUT:annual_inc .
## home_ownershipOWN:addr_stateUT:annual_inc 3.188550e-07
## home_ownershipRENT:addr_stateUT:annual_inc -6.833735e-05
## home_ownershipOTHER:addr_stateVA:annual_inc -5.257542e-06
## home_ownershipOWN:addr_stateVA:annual_inc -2.415407e-05
## home_ownershipRENT:addr_stateVA:annual_inc 7.975434e-05
## home_ownershipOTHER:addr_stateWA:annual_inc .
## home_ownershipOWN:addr_stateWA:annual_inc 3.016251e-05
## home_ownershipRENT:addr_stateWA:annual_inc -1.356471e-04
## home_ownershipOTHER:addr_stateWI:annual_inc .
## home_ownershipOWN:addr_stateWI:annual_inc 1.169977e-06
## home_ownershipRENT:addr_stateWI:annual_inc 2.799035e-05
## gradeB:price:inflation -4.002137e-04
## gradeC:price:inflation -2.541590e-03
## gradeD:price:inflation -1.131275e-03
## gradeE:price:inflation -5.154445e-04
## gradeF:price:inflation -4.730201e-04
## gradeG:price:inflation .
# Penalty value (lambda) selected as the best tuning parameter
lasso$bestTune[["lambda"]]
## [1] 0
# Number of coefficients that are non-zero (positive or negative) at the best
# lambda; the number that are exactly zero is counted separately below.
sum(coef(lasso$finalModel, s = lasso$bestTune$lambda) != 0)
## [1] 269
# Number of coefficients that are exactly zero at the best lambda
sum(coef(lasso$finalModel, s = lasso$bestTune$lambda) == 0)
## [1] 52
# Predict the interest rate for every row of the `testing` data
predictions <- predict(lasso, newdata = testing)
# Compare predictions against the observed interest rates:
# RMSE (root mean squared error) and R-squared in a one-row data frame
data.frame(
  RMSE = RMSE(pred = predictions, obs = testing$int_rate),
  Rsquare = R2(pred = predictions, obs = testing$int_rate)
)
RMSE | Rsquare |
---|---|
0.00413 | 0.988 |
Does the Additional Data Make a Difference?
The interest rate is only weakly correlated with inflation once loans are segmented by grade. Adding inflation to the model therefore produces a slight increase in prediction accuracy for both the linear model and the LASSO regression, but the gain is small because the correlation is weak.
Comment and explain each row of the code in the chunk below.
The accuracy of the model changes by less than 1% between the training set and the test set, which indicates a good model with no over-fitting. The chosen random seed affects the final values because it determines how the data are randomly split into the training and test sets.