Data preparation

Let’s use the Boston dataset from the MASS package, and suppose we want to predict the median value of houses (last variable) based on the other available variables.

Variable Description
crim per capita crime rate by town
zn proportion of residential land zoned for lots over 25,000 sq.ft.
indus proportion of non-retail business acres per town
chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
nox nitrogen oxides concentration (parts per 10 million)
rm average number of rooms per dwelling
age proportion of owner-occupied units built prior to 1940
dis weighted mean of distances to five Boston employment centres
rad index of accessibility to radial highways
tax full-value property-tax rate per $10,000
ptratio pupil-teacher ratio by town
black \(1000(Bk - 0.63)^2\) where \(Bk\) is the proportion of blacks by town
lstat lower status of the population (percent)
medv median value of owner-occupied homes in $1000s

Separate the data into train and test datasets, with 75% of the data in the training dataset, sampled on our dependent variable medv:

# Load the Boston housing data (requires the MASS package; caret for the split).
data <- MASS::Boston

# createDataPartition() (caret) samples within quantile strata of medv, so the
# 75/25 split preserves the outcome's distribution in both subsets.
# NOTE(review): no set.seed() here, so the partition differs on every run —
# confirm whether reproducibility of the reported numbers matters.
index <- createDataPartition(data$medv, p = 0.75, list = FALSE, times = 1)
train <- data[index,]
test  <- data[-index,]

Linear regression model for comparison

Calculate the linear regression model on the training set.

# OLS baseline: regress medv on all other columns of the training set.
lm.fit <- lm(medv ~ ., data=train)
# Render a regression table (estimates, 95% CIs, p-values) via sjPlot.
sjPlot::tab_model(lm.fit)
  medv
Predictors Estimates CI p
(Intercept) 44.51 32.35 – 56.67 <0.001
crim -0.07 -0.16 – 0.02 0.119
zn 0.06 0.03 – 0.09 <0.001
indus -0.01 -0.15 – 0.12 0.845
chas 3.02 1.09 – 4.96 0.002
nox -20.41 -29.38 – -11.43 <0.001
rm 3.09 2.06 – 4.12 <0.001
age 0.02 -0.01 – 0.05 0.201
dis -1.61 -2.10 – -1.13 <0.001
rad 0.32 0.16 – 0.47 <0.001
tax -0.01 -0.02 – -0.00 0.010
ptratio -1.04 -1.34 – -0.75 <0.001
black 0.01 0.00 – 0.02 0.003
lstat -0.64 -0.77 – -0.52 <0.001
Observations 381
R2 / adjusted R2 0.754 / 0.745

Make predictions on the testing set, and calculate the mean squared error (MSE):

# Predict medv for the held-out rows and score with the mean squared
# error; mean(err^2) is identical to sum(err^2) / n.
pr.lm <- predict(lm.fit, newdata = test)
MSE.lm <- mean((pr.lm - test$medv)^2)
MSE.lm
## [1] 22.89701

Set up a neural network

Step 1: Normalize the data

# Estimate min-max scaling parameters, then apply them, mapping every
# column (medv included) onto [0, 1].
# NOTE(review): the scaler is fit on the full dataset before the split, so
# test rows inform the ranges — fine for a tutorial, but a leakage risk in
# real modelling; confirm this is intentional.
range.params <- preProcess(data, method = "range")
norm.data <- predict(range.params, data)

# Sanity check: every column should now span exactly [0, 1].
summary(norm.data) %>% t() %>% knitr::kable()
crim Min. :0.0000000 1st Qu.:0.0008511 Median :0.0028121 Mean :0.0405441 3rd Qu.:0.0412585 Max. :1.0000000
zn Min. :0.0000 1st Qu.:0.0000 Median :0.0000 Mean :0.1136 3rd Qu.:0.1250 Max. :1.0000
indus Min. :0.0000 1st Qu.:0.1734 Median :0.3383 Mean :0.3914 3rd Qu.:0.6466 Max. :1.0000
chas Min. :0.00000 1st Qu.:0.00000 Median :0.00000 Mean :0.06917 3rd Qu.:0.00000 Max. :1.00000
nox Min. :0.0000 1st Qu.:0.1317 Median :0.3148 Mean :0.3492 3rd Qu.:0.4918 Max. :1.0000
rm Min. :0.0000 1st Qu.:0.4454 Median :0.5073 Mean :0.5219 3rd Qu.:0.5868 Max. :1.0000
age Min. :0.0000 1st Qu.:0.4338 Median :0.7683 Mean :0.6764 3rd Qu.:0.9390 Max. :1.0000
dis Min. :0.00000 1st Qu.:0.08826 Median :0.18895 Mean :0.24238 3rd Qu.:0.36909 Max. :1.00000
rad Min. :0.0000 1st Qu.:0.1304 Median :0.1739 Mean :0.3717 3rd Qu.:1.0000 Max. :1.0000
tax Min. :0.0000 1st Qu.:0.1756 Median :0.2729 Mean :0.4222 3rd Qu.:0.9141 Max. :1.0000
ptratio Min. :0.0000 1st Qu.:0.5106 Median :0.6862 Mean :0.6229 3rd Qu.:0.8085 Max. :1.0000
black Min. :0.0000 1st Qu.:0.9457 Median :0.9862 Mean :0.8986 3rd Qu.:0.9983 Max. :1.0000
lstat Min. :0.0000 1st Qu.:0.1440 Median :0.2657 Mean :0.3014 3rd Qu.:0.4201 Max. :1.0000
medv Min. :0.0000 1st Qu.:0.2672 Median :0.3600 Mean :0.3896 3rd Qu.:0.4444 Max. :1.0000
# rebuild training and testing datasets, but normalized
# Reuse the same row indices as the raw split so norm.train/norm.test
# match train/test row-for-row.
norm.train <- norm.data[index,]
norm.test  <- norm.data[-index,]

Step 2: Create and train your neural net

The parameter hidden sets up how many interior layers there are and how many neurons are inside each layer.

# Train a feed-forward network: two hidden layers of 5 and 3 neurons.
# linear.output = TRUE keeps the output node linear (regression, not
# classification).
nn <- neuralnet(medv ~ ., data = norm.train, hidden=c(5,3), linear.output=TRUE)
# rep = "best" plots the training repetition with the lowest error.
plot(nn, rep = "best")

Step 3: Use the trained network to make predictions on the test dataset

# Predict normalized medv for the test rows; the response column is dropped
# so only the predictor columns are fed to the network.
pr.nn <- predict(nn, newdata = norm.test %>% select(-medv))

De-normalize the predictions to compare them with the actual data, and calculate the mean squared error:

# Invert the min-max scaling: x_norm * (max - min) + min puts both the
# network predictions and the observed values back on the $1000s scale.
medv.range <- range(data$medv)
pr.nn_ <- pr.nn * diff(medv.range) + medv.range[1]
test.r <- norm.test$medv * diff(medv.range) + medv.range[1]

# Mean squared error on the original scale, directly comparable to MSE.lm.
MSE.nn <- mean((test.r - pr.nn_)^2)
MSE.nn
## [1] 12.12083

Compare linear regression to neural net

Linear Model MSE Neural Net MSE
22.8970055 12.1208343

Visualize the difference:

# Observed vs predicted medv for both models, side by side.
# check.names = FALSE keeps the human-readable column names, so the facet
# strips read "Linear Model" / "Neural Net" instead of dot-mangled names;
# also fixes the "Neuronal" typo in the label.
data.frame(
  real = test$medv,
  `Neural Net`   = pr.nn_,
  `Linear Model` = pr.lm,
  check.names = FALSE
) %>% 
  # pivot_longer() supersedes gather(): one row per (model, prediction) pair
  pivot_longer(-real, names_to = "variable", values_to = "Prediction") %>% 
  ggplot() +
    aes(x = real, y = Prediction) +
    geom_abline() +   # 45-degree reference: perfect predictions lie on it
    geom_point() +
    facet_wrap(~variable) +
    labs(x = "Observed value",
         title = "Performance comparison between methods",
         caption = "Straight line is the 45-degree line") +
    ggthemes::theme_tufte(base_size = 16)

Notice that the points are closer to the 45-degree line for the neural net than for the linear model, indicating the better prediction. This is the case especially for high values of the house prices, where the linear regression systematically under-predicts, while the neural net works well.

Repeat the analysis many times

# Repeat the split/train/evaluate cycle 10 times with fresh random splits
# to check that the neural net's advantage is systematic, not luck.

# The min-max scaling is fit on the full dataset, so it is identical for
# every split — compute it once instead of inside the loop.
norm.data <- data %>% 
  preProcess(method = c("range")) %>% 
  predict(data)
medv.min <- min(data$medv)
medv.max <- max(data$medv)

MSE <- map_df(1:10, ~{
  
  # create datasets
  index <- createDataPartition(data$medv, p = 0.75, list = FALSE, times = 1)
  train <- data[index, ]
  test  <- data[-index, ]
  
  # the linear regression
  lm.fit <- lm(medv ~ ., data=train)
  pr.lm <- predict(lm.fit, test)
  MSE.lm <- mean((pr.lm - test$medv)^2)
  
  # split the pre-normalized data with the same row indices
  norm.train <- norm.data[index,]
  norm.test  <- norm.data[-index,]
  
  # train the neural net
  nn <- neuralnet(medv ~ ., data = norm.train, hidden=c(5,3), linear.output=TRUE)
  
  # make prediction
  pr.nn <- predict(nn, newdata = norm.test %>% select(-medv))
  
  # de-normalize and compare to observed data
  pr.nn_ <- pr.nn * (medv.max - medv.min) + medv.min
  test.r <- norm.test$medv * (medv.max - medv.min) + medv.min

  MSE.nn <- mean((test.r - pr.nn_)^2)
  
  # one row per replication (last expression is the return value)
  data.frame(MSE.lm, MSE.nn)
  
  })

MSE %>% knitr::kable()
MSE.lm MSE.nn
18.83103 10.924699
24.99424 10.667461
21.48697 13.980639
25.47948 24.908207
17.59574 10.003433
17.64352 8.836584
18.22807 10.185332
25.00585 10.826451
26.86839 30.021057
21.07556 12.285226
MSE %>% map_dbl(mean) %>% t() %>% knitr::kable()
MSE.lm MSE.nn
21.72088 14.26391

The neural net indeed has systematically better predictive power than the linear regression.

Compare different network architectures

The internal structure of the network generally affects its performance. The total number of neurons in the network is not the only thing that matters. Let us explore a few network configurations with 8 neurons:

# Fit min-max scaling once on the full dataset (identical for every split).
norm.data <- data %>% 
  preProcess(method = c("range")) %>% 
  predict(data)

# De-normalization constants depend only on the raw data — hoist them out
# of the 100 x 4 inner loops instead of recomputing max/min each time.
medv.min <- min(data$medv)
medv.max <- max(data$medv)

# Four architectures, each with 8 hidden neurons in total.
architectures <- list(
  c(2,2,2,2),
  c(2,4,2),
  c(4,4),
  c(8)
)

# repeat 100 times on 100 different test/train datasets; each iteration
# scores all four architectures on the SAME split, so they are comparable
MSE <- map(1:100, ~{

  # create training and testing datasets
  index <- createDataPartition(data$medv, p = 0.75, list = FALSE, times = 1)
  norm.train <- norm.data[index,]
  norm.test  <- norm.data[-index,]
  
  # one MSE per architecture
  map_dbl(architectures, function(architecture) {
    
    # train the neural net with the given hidden-layer layout
    nn <- neuralnet(medv ~ ., data = norm.train, 
                    hidden=architecture, linear.output=TRUE)

    # predict on the held-out rows
    pr.nn <- predict(nn, newdata = norm.test %>% select(-medv))

    # de-normalize and score against the observed values
    pr.nn_ <- pr.nn * (medv.max - medv.min) + medv.min
    test.r <- norm.test$medv * (medv.max - medv.min) + medv.min

    mean((test.r - pr.nn_)^2)
  })
  
  })

# turn list into dataframe; across() supersedes the deprecated mutate_all()
MSE <- do.call(rbind.data.frame, MSE) %>% 
  mutate(across(everything(), ~round(.x, 3)))
colnames(MSE) <- c("a2222", "a242", "a44", "a8")

MSE %>% DT::datatable()

Calculate the average performance for each architecture:

# Column means plus the half-width of 95% t-based confidence intervals;
# the critical value and replication count are computed once up front.
n.reps <- nrow(MSE)
t.crit <- qt(0.975, df = n.reps - 1)
MSE.means <- data.frame(
    mean  = map_dbl(MSE, mean),
    error = map_dbl(MSE, function(x) t.crit * sd(x) / sqrt(n.reps))
  ) %>% rownames_to_column("architecture")

# Bar chart of mean MSE per architecture with 95% CI error bars.
MSE.means %>% 
  ggplot() +
    aes(x = architecture, y = mean, label = round(mean,1)) +
    geom_bar(stat="identity") +
    geom_errorbar(aes(ymin=mean-error, ymax=mean+error), width=.2) +
    geom_label() +
    labs(title = "Performance differences between different network architectures",
         caption = "Error bars show the 95% confidence intervals") +
    ggthemes::theme_tufte(base_size = 16)