```r
library(tidyverse)
```
The data at the URL http://www-bcf.usc.edu/~gareth/ISL/Income1.csv contain observations on income level (in tens of thousands of dollars) and years of education. The data are simulated rather than real.
```r
income <- read_csv("http://www-bcf.usc.edu/~gareth/ISL/Income1.csv")
```

```
Missing column names filled in: 'X1' [1]
Parsed with column specification:
cols(
  X1 = col_integer(),
  Education = col_double(),
  Income = col_double()
)
```
```r
# If another loaded package masks dplyr::select (as MASS does later), plain
# select(-X1) fails with "unused argument (-X1)"; qualifying the call avoids this.
income <- income %>% dplyr::select(-X1)
```
```r
# method = "lm" is assumed here; the original left the argument blank
ggplot(income, aes(x = Education, y = Income)) +
  geom_point() +
  geom_smooth(method = "lm")
```
```r
set.seed(12345)
idx <- sample(1:nrow(income), floor(0.7 * nrow(income)))
income_train <- income[idx, ]
income_test <- income[-idx, ]
```
Fit a simple linear model of Income on Education using the training set, and inspect the fit with summary():

```r
fit <- lm(data = income_train, formula = Income ~ Education)
summary(fit)
```
```
Call:
lm(formula = Income ~ Education, data = income_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-11.7076  -4.2127   0.8739   3.6296  11.9906 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -43.5026     6.3357  -6.866 1.50e-06 ***
Education     5.8171     0.3829  15.190 4.41e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.149 on 19 degrees of freedom
Multiple R-squared:  0.9239,	Adjusted R-squared:  0.9199
F-statistic: 230.7 on 1 and 19 DF,  p-value: 4.408e-12
```
```r
coef(fit)
```

```
(Intercept)   Education 
 -43.502593    5.817084 
```
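As a quick sanity check (our arithmetic, not part of the original), the fitted line can be applied by hand:

```r
# Predicted income for 16 years of education: intercept + slope * 16
-43.502593 + 5.817084 * 16
# = 49.570751, matching (up to rounding) the prediction for Education = 16.00 below
```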
```r
library(modelr)
income_train <- income_train %>% add_predictions(fit)
income_train
```

```
# A tibble: 21 x 3
   Education Income  pred
       <dbl>  <dbl> <dbl>
 1      18.7   69.0  65.3
 2      20.4   75.8  74.9
 3      22     80.3  84.5
 4      19.5   71.9  70.0
 5      14.5   41.5  41.0
 6      11.6   15.2  24.2
 7      12.9   25.5  31.5
 8      21.6   72.1  82.1
 9      16.6   51.5  53.2
10      18.3   64.3  62.8
# ... with 11 more rows
```
```r
newobs <- data.frame(Education = c(16.00, 12.52, 15.55, 21.09, 18.36))
newobs <- newobs %>% add_predictions(fit)
newobs
```

```
  Education     pred
1     16.00 49.57076
2     12.52 29.32730
3     15.55 46.95307
4     21.09 79.17972
5     18.36 63.29908
```
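As an aside (this computation is not in the original), the `pred` column added to `income_train` above also yields the training-set RMSE, a useful point of comparison for the test RMSE computed next:

```r
# Training-set RMSE from the predictions added by add_predictions()
sqrt(mean((income_train$Income - income_train$pred)^2))
```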
```r
income_test <- income_test %>% add_predictions(fit)
(RMSE <- sqrt(mean((income_test$Income - income_test$pred)^2)))
```

```
[1] 4.767779
```
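For reference, the quantity computed above is the root mean squared error over the $n$ test observations:

$$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}$$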
```r
income2 <- read_csv("http://www-bcf.usc.edu/~gareth/ISL/Income2.csv")
```
```
Missing column names filled in: 'X1' [1]
Parsed with column specification:
cols(
  X1 = col_integer(),
  Education = col_double(),
  Seniority = col_double(),
  Income = col_double()
)
```
```r
(income2 <- income2 %>% select(-X1))
```

```
# A tibble: 30 x 3
   Education Seniority Income
       <dbl>     <dbl>  <dbl>
 1      21.6     113.    99.9
 2      18.3     119.    92.6
 3      12.1     101.    34.7
 4      17.0     188.    78.7
 5      19.9      20     68.0
 6      18.3      26.2   71.5
 7      19.9     150.    88.0
 8      21.2      82.1   79.8
 9      20.3      88.3   90.0
10      10       113.    45.7
# ... with 20 more rows
```
```r
set.seed(12345)
# Note: split income2 (not income); both datasets happen to have 30 rows
idx <- sample(1:nrow(income2), floor(0.7 * nrow(income2)))
income2_train <- income2[idx, ]
income2_test <- income2[-idx, ]
```
```r
mfit <- lm(data = income2_train, formula = Income ~ Education + Seniority)
summary(mfit)
```
```
Call:
lm(formula = Income ~ Education + Seniority, data = income2_train)

Residuals:
   Min     1Q Median     3Q    Max 
-9.538 -4.833 -0.184  3.283 12.358 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -58.13205    6.52578  -8.908 5.13e-08 ***
Education     6.32393    0.39571  15.981 4.44e-12 ***
Seniority     0.16557    0.02644   6.261 6.63e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.035 on 18 degrees of freedom
Multiple R-squared:  0.9558,	Adjusted R-squared:  0.9509
F-statistic: 194.7 on 2 and 18 DF,  p-value: 6.419e-13
```
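As an illustrative check (our arithmetic, not part of the original), the fitted equation can be applied by hand to the first training row shown below:

```r
# Predicted income for Education = 14.6, Seniority = 138 (rounded display values):
-58.13205 + 6.32393 * 14.6 + 0.16557 * 138
# = 57.04599; the tibble below prints pred = 56.7 for this row because its
# Education and Seniority values are rounded in the display
```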
```r
(income2_train <- income2_train %>% add_predictions(mfit))
```

```
# A tibble: 21 x 4
   Education Seniority Income  pred
       <dbl>     <dbl>  <dbl> <dbl>
 1      14.6     138.    53.5  56.7
 2      11.2      44.8   21.4  20.4
 3      17.0     107.    74.6  67.3
 4      10.4      32.4   18.6  13.1
 5      18.7     144.    96.3  83.9
 6      19.9      20     68.0  71.2
 7      21.2      82.1   79.8  89.3
 8      12.1      32.4   17.6  23.6
 9      14.1      20     28.8  34.6
10      18.3     101.    74.7  74.1
# ... with 11 more rows
```
```r
income2_test <- income2_test %>% add_predictions(mfit)
(RMSE_mfit <- sqrt(mean((income2_test$Income - income2_test$pred)^2)))
```

```
[1] 10.00648
```
Note that since we did not set aside a validation set, we cannot use these RMSEs to choose between the two models (and the two models were fit on different datasets, so the comparison would not be like for like anyway). If you wanted to do that, you would need to start from a single dataset, split it into three parts, use the RMSE on the validation set to pick between the two models, and then evaluate the performance of the chosen model on the test set separately.
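For concreteness, here is a minimal sketch of such a three-way split, assuming a 60/20/20 division (the proportions and object names such as `income_val` are hypothetical):

```r
# Hypothetical train / validation / test split (60/20/20)
set.seed(12345)
n <- nrow(income)
idx <- sample(1:n)  # shuffled row indices
income_tr  <- income[idx[1:floor(0.6 * n)], ]                     # fit candidate models
income_val <- income[idx[(floor(0.6 * n) + 1):floor(0.8 * n)], ]  # compare validation RMSEs
income_te  <- income[idx[(floor(0.8 * n) + 1):n], ]               # evaluate the chosen model once
```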
In this exercise you will perform Lasso regression yourself. We will use the Boston dataset from the MASS package. The dataset contains information on the Boston suburbs' housing market, collected by David Harrison in 1978. We will try to predict the median value of homes in a region (the medv variable) from the attributes recorded in the other variables.
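As a refresher (standard notation, not quoted from this exercise), the Lasso estimates coefficients by minimizing the residual sum of squares plus an L1 penalty that can shrink some coefficients exactly to zero:

$$\hat{\beta}^{\text{lasso}} = \underset{\beta}{\arg\min}\left\{\sum_{i=1}^{n}\Big(y_i - \beta_0 - \sum_{j=1}^{p} x_{ij}\beta_j\Big)^2 + \lambda\sum_{j=1}^{p}\lvert\beta_j\rvert\right\}$$

The tuning parameter $\lambda$ controls the strength of the penalty; below, it is chosen by cross-validation.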
First install the package:
```r
# install.packages("MASS")    # glmnet, used further below, may also need installing:
# install.packages("glmnet")
```
```r
# We are not loading the entire MASS library because we only use one of its
# datasets (also, MASS has a select() function that conflicts with dplyr::select).
Boston <- MASS::Boston
head(Boston, 3)
```
```
     crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat medv
1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98 24.0
2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14 21.6
3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03 34.7
```
```r
str(Boston)
```
```
'data.frame':	506 obs. of  14 variables:
 $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
 $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
 $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
 $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
 $ rm     : num  6.58 6.42 7.18 7 7.15 ...
 $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
 $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
 $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
 $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
 $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
 $ black  : num  397 397 393 395 397 ...
 $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
 $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
```
```r
set.seed(123)
idx <- sample(1:nrow(Boston), round(0.7 * nrow(Boston)))
boston_train <- Boston[idx, ]
boston_test <- Boston[-idx, ]
```
Now prepare the data for glmnet. Steps: separate the predictors and the response from the Boston data.frame, and convert them if necessary to the format glmnet expects (a numeric predictor matrix and a response vector).

```r
# medv (the response) is the last column; drop it from the predictor matrix
X_train <- data.matrix(boston_train[, -ncol(boston_train)])
y_train <- boston_train$medv
X_test  <- data.matrix(boston_test[, -ncol(boston_test)])
y_test  <- boston_test$medv
```
```r
library(glmnet)
```

```
Loading required package: Matrix

Attaching package: ‘Matrix’

The following object is masked from ‘package:tidyr’:

    expand

Loading required package: foreach

Attaching package: ‘foreach’

The following objects are masked from ‘package:purrr’:

    accumulate, when

Loaded glmnet 2.0-16
```
```r
cvfit <- cv.glmnet(X_train, y_train, nfolds = 5)
plot(cvfit)
```

The plot shows the cross-validated MSE along the lambda path, with vertical lines at the two standard candidate values: lambda.min (the lambda that minimizes the cross-validated error) and lambda.1se (the largest lambda whose error is within one standard error of that minimum). Report both:
```r
cvfit$lambda.min
```

```
[1] 0.03383821
```

```r
cvfit$lambda.1se
```

```
[1] 0.3155763
```
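Optionally (this output is not part of the original), you can inspect which coefficients the Lasso shrinks exactly to zero at each candidate value; `coef()` on a `cv.glmnet` object accepts these labels directly:

```r
# The lambda.1se model is typically sparser (more coefficients exactly zero)
coef(cvfit, s = "lambda.min")
coef(cvfit, s = "lambda.1se")
```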
Make predictions on the test set using lambda.min and lambda.1se, and evaluate the error for each. Here we use lambda.1se:

```r
final_pred <- predict(cvfit, newx = X_test, s = "lambda.1se")
head(final_pred)
```
```
          1
3  31.29388
6  26.56063
8  19.69023
9  11.02580
10 19.86751
18 18.23750
```
```r
(RMSE <- sqrt(mean((final_pred - y_test)^2)))
```

```
[1] 4.700897
```
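The corresponding evaluation at `lambda.min` is a one-line variation (a sketch; its output was not shown in the original):

```r
# Same evaluation with the error-minimizing lambda
pred_min <- predict(cvfit, newx = X_test, s = "lambda.min")
(RMSE_min <- sqrt(mean((pred_min - y_test)^2)))
```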