Let’s use the Boston dataset from the MASS package, and suppose we want to predict the median value of houses (the last variable, medv) from the other available variables.
Variable | Description |
---|---|
crim | per capita crime rate by town |
zn | proportion of residential land zoned for lots over 25,000 sq.ft. |
indus | proportion of non-retail business acres per town |
chas | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) |
nox | nitrogen oxides concentration (parts per 10 million) |
rm | average number of rooms per dwelling |
age | proportion of owner-occupied units built prior to 1940 |
dis | weighted mean of distances to five Boston employment centres |
rad | index of accessibility to radial highways |
tax | full-value property-tax rate per $10,000 |
ptratio | pupil-teacher ratio by town |
black | \(1000(Bk−0.63)^2\) where \(Bk\) is the proportion of blacks by town |
lstat | lower status of the population (percent) |
medv | median value of owner-occupied homes in $1000s |
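Before modeling, it helps to load the data and confirm its shape; a quick sketch in base R (the Boston data frame ships with MASS):

```r
library(MASS)

# the Boston data frame: 506 census tracts, 14 variables
dim(Boston)
## [1] 506  14

# medv, the prediction target, is the last column
names(Boston)[ncol(Boston)]
## [1] "medv"
```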
Separate the data into train and test datasets, with 75% of the data in the training set, stratified on our dependent variable medv (createDataPartition comes from the caret package):
data <- MASS::Boston
index <- createDataPartition(data$medv, p = 0.75, list = FALSE, times = 1)
train <- data[index,]
test <- data[-index,]
Calculate the linear regression model on the training set.
lm.fit <- lm(medv ~ ., data=train)
sjPlot::tab_model(lm.fit)
Dependent variable: medv

Predictors | Estimates | CI | p |
---|---|---|---|
(Intercept) | 44.51 | 32.35 – 56.67 | <0.001 |
crim | -0.07 | -0.16 – 0.02 | 0.119 |
zn | 0.06 | 0.03 – 0.09 | <0.001 |
indus | -0.01 | -0.15 – 0.12 | 0.845 |
chas | 3.02 | 1.09 – 4.96 | 0.002 |
nox | -20.41 | -29.38 – -11.43 | <0.001 |
rm | 3.09 | 2.06 – 4.12 | <0.001 |
age | 0.02 | -0.01 – 0.05 | 0.201 |
dis | -1.61 | -2.10 – -1.13 | <0.001 |
rad | 0.32 | 0.16 – 0.47 | <0.001 |
tax | -0.01 | -0.02 – -0.00 | 0.010 |
ptratio | -1.04 | -1.34 – -0.75 | <0.001 |
black | 0.01 | 0.00 – 0.02 | 0.003 |
lstat | -0.64 | -0.77 – -0.52 | <0.001 |
Observations | 381 | | |
R2 / adjusted R2 | 0.754 / 0.745 | | |
Make predictions on the testing set, and calculate the mean squared error (MSE):
pr.lm <- predict(lm.fit, test)
MSE.lm <- sum((pr.lm - test$medv)^2)/nrow(test)
MSE.lm
## [1] 22.89701
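The MSE computed above is simply the average squared difference between predictions and observations. Wrapping it in a small helper (the name mse is made up here) makes the later comparisons less error-prone:

```r
# mean squared error between observed and predicted values
mse <- function(observed, predicted) {
  mean((observed - predicted)^2)
}

# sanity check on toy data: the errors are 0, 0 and 1, so the MSE is 1/3
mse(c(1, 2, 3), c(1, 2, 4))
## [1] 0.3333333
```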
To prepare the data for the neural network, normalize all variables to the \([0,1]\) range with caret’s preProcess:
norm.data <- data %>%
# calculate the pre-processing parameters from the dataset
preProcess(method = c("range")) %>%
# transform the dataset based on the pre-processing parameters
predict(data)
# summarize the transformed dataset to check that it worked out well
summary(norm.data) %>% t() %>% knitr::kable()
 | Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
---|---|---|---|---|---|---|
crim | Min. :0.0000000 | 1st Qu.:0.0008511 | Median :0.0028121 | Mean :0.0405441 | 3rd Qu.:0.0412585 | Max. :1.0000000 |
zn | Min. :0.0000 | 1st Qu.:0.0000 | Median :0.0000 | Mean :0.1136 | 3rd Qu.:0.1250 | Max. :1.0000 |
indus | Min. :0.0000 | 1st Qu.:0.1734 | Median :0.3383 | Mean :0.3914 | 3rd Qu.:0.6466 | Max. :1.0000 |
chas | Min. :0.00000 | 1st Qu.:0.00000 | Median :0.00000 | Mean :0.06917 | 3rd Qu.:0.00000 | Max. :1.00000 |
nox | Min. :0.0000 | 1st Qu.:0.1317 | Median :0.3148 | Mean :0.3492 | 3rd Qu.:0.4918 | Max. :1.0000 |
rm | Min. :0.0000 | 1st Qu.:0.4454 | Median :0.5073 | Mean :0.5219 | 3rd Qu.:0.5868 | Max. :1.0000 |
age | Min. :0.0000 | 1st Qu.:0.4338 | Median :0.7683 | Mean :0.6764 | 3rd Qu.:0.9390 | Max. :1.0000 |
dis | Min. :0.00000 | 1st Qu.:0.08826 | Median :0.18895 | Mean :0.24238 | 3rd Qu.:0.36909 | Max. :1.00000 |
rad | Min. :0.0000 | 1st Qu.:0.1304 | Median :0.1739 | Mean :0.3717 | 3rd Qu.:1.0000 | Max. :1.0000 |
tax | Min. :0.0000 | 1st Qu.:0.1756 | Median :0.2729 | Mean :0.4222 | 3rd Qu.:0.9141 | Max. :1.0000 |
ptratio | Min. :0.0000 | 1st Qu.:0.5106 | Median :0.6862 | Mean :0.6229 | 3rd Qu.:0.8085 | Max. :1.0000 |
black | Min. :0.0000 | 1st Qu.:0.9457 | Median :0.9862 | Mean :0.8986 | 3rd Qu.:0.9983 | Max. :1.0000 |
lstat | Min. :0.0000 | 1st Qu.:0.1440 | Median :0.2657 | Mean :0.3014 | 3rd Qu.:0.4201 | Max. :1.0000 |
medv | Min. :0.0000 | 1st Qu.:0.2672 | Median :0.3600 | Mean :0.3896 | 3rd Qu.:0.4444 | Max. :1.0000 |
# rebuild training and testing datasets, but normalized
norm.train <- norm.data[index,]
norm.test <- norm.data[-index,]
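caret’s "range" method rescales each column to \([0,1]\) using that column’s minimum and maximum, and the de-normalization used later simply inverts this. A minimal base-R sketch (the helper names are made up):

```r
# rescale a numeric vector to [0, 1], as preProcess(method = "range") does
range_scale <- function(x) (x - min(x)) / (max(x) - min(x))

# invert the scaling, using the min and max of the original data
range_unscale <- function(x_norm, original) {
  x_norm * (max(original) - min(original)) + min(original)
}

v <- c(5, 10, 30)
s <- range_scale(v)
s
## [1] 0.0 0.2 1.0
range_unscale(s, v)
## [1]  5 10 30
```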
The hidden parameter sets the number of hidden layers and the number of neurons in each layer; the neuralnet function comes from the neuralnet package. Here we use two hidden layers with 5 and 3 neurons:
nn <- neuralnet(medv ~ ., data = norm.train, hidden=c(5,3), linear.output=TRUE)
plot(nn, rep = "best")
pr.nn <- predict(nn, newdata = norm.test %>% select(-medv))
De-normalize the predictions to compare them with the actual data, and calculate the mean squared error:
pr.nn_ <- pr.nn * (max(data$medv) - min(data$medv)) + min(data$medv)
test.r <- norm.test$medv * (max(data$medv) - min(data$medv)) + min(data$medv)
MSE.nn <- sum((test.r - pr.nn_)^2)/nrow(norm.test)
MSE.nn
## [1] 12.12083
Linear Model MSE | Neural Net MSE |
---|---|
22.8970055 | 12.1208343 |
Visualize the difference:
data.frame(
real = test$medv,
`Neural Net` = pr.nn_,
`Linear Model` = pr.lm
) %>%
gather(key = "variable", value = "Prediction", -real) %>%
ggplot() +
aes(x = real, y = Prediction) +
geom_abline() +
geom_point() +
facet_wrap(~variable) +
labs(x = "Observed value",
title = "Performance comparison between methods",
caption = "Straight line is the 45-degree line") +
ggthemes::theme_tufte(base_size = 16)
Notice that the points are closer to the 45-degree line for the neural net than for the linear model, indicating better predictions. This is especially the case for high house prices, where the linear regression systematically under-predicts while the neural net performs well.
# repeat analysis 10 times
MSE <- map_df(1:10, ~{
# create datasets
index <- createDataPartition(data$medv, p = 0.75, list = FALSE, times = 1)
train <- data[index, ]
test <- data[-index, ]
# the linear regression
lm.fit <- lm(medv ~ ., data=train)
pr.lm <- predict(lm.fit, test)
MSE.lm <- sum((pr.lm - test$medv)^2)/nrow(test)
# normalize datasets
norm.data <- data %>%
preProcess(method = c("range")) %>%
predict(data)
norm.train <- norm.data[index,]
norm.test <- norm.data[-index,]
# train the neural net
nn <- neuralnet(medv ~ ., data = norm.train, hidden=c(5,3), linear.output=TRUE)
# make prediction
pr.nn <- predict(nn, newdata = norm.test %>% select(-medv))
# de-normalize and compare to observed data
pr.nn_ <- pr.nn * (max(data$medv) - min(data$medv)) + min(data$medv)
test.r <- norm.test$medv * (max(data$medv) - min(data$medv)) + min(data$medv)
MSE.nn <- sum((test.r - pr.nn_)^2)/nrow(norm.test)
return(data.frame(MSE.lm, MSE.nn))
})
MSE %>% knitr::kable()
MSE.lm | MSE.nn |
---|---|
18.83103 | 10.924699 |
24.99424 | 10.667461 |
21.48697 | 13.980639 |
25.47948 | 24.908207 |
17.59574 | 10.003433 |
17.64352 | 8.836584 |
18.22807 | 10.185332 |
25.00585 | 10.826451 |
26.86839 | 30.021057 |
21.07556 | 12.285226 |
MSE %>% map_dbl(mean) %>% t() %>% knitr::kable()
MSE.lm | MSE.nn |
---|---|
21.72088 | 14.26391 |
The neural net indeed has systematically better predictive power than the linear regression.
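To check that the gap is systematic rather than an artifact of lucky splits, we can run a paired t-test on the ten MSE pairs (values copied from the table above; each pair comes from the same train/test split):

```r
mse.lm <- c(18.83103, 24.99424, 21.48697, 25.47948, 17.59574,
            17.64352, 18.22807, 25.00585, 26.86839, 21.07556)
mse.nn <- c(10.924699, 10.667461, 13.980639, 24.908207, 10.003433,
            8.836584, 10.185332, 10.826451, 30.021057, 12.285226)

# paired test of the mean difference in MSE across the ten splits
t.test(mse.lm, mse.nn, paired = TRUE)
```

The mean difference is about 7.5 in favour of the neural net, and the test rejects equality at the 5% level.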
The internal structure of the network generally affects its performance. The total number of neurons in the network is not the only thing that matters. Let us explore a few network configurations with 8 neurons:
# create normalized dataset
norm.data <- data %>%
preProcess(method = c("range")) %>%
predict(data)
# repeat 100 times on 100 different test/train datasets
MSE <- map(1:100, ~{
# create training and testing datasets
index <- createDataPartition(data$medv, p = 0.75, list = FALSE, times = 1)
norm.train <- norm.data[index,]
norm.test <- norm.data[-index,]
# try out different network architectures
MSE.nn.vector <-
list(
c(2,2,2,2),
c(2,4,2),
c(4,4),
c(8)
) %>%
map_dbl(function(architecture) {
# train the neural net
nn <- neuralnet(medv ~ ., data = norm.train,
hidden=architecture, linear.output=TRUE)
# make prediction
pr.nn <- predict(nn, newdata = norm.test %>% select(-medv))
# de-normalize and compare to observed data
pr.nn_ <- pr.nn * (max(data$medv) - min(data$medv)) + min(data$medv)
test.r <- norm.test$medv * (max(data$medv) - min(data$medv)) + min(data$medv)
MSE.nn <- sum((test.r - pr.nn_)^2)/nrow(norm.test)
return(MSE.nn)
})
return(MSE.nn.vector)
})
# turn list into dataframe
MSE <- do.call(rbind.data.frame, MSE) %>% mutate_all(round,3)
colnames(MSE) <- c("a2222", "a242", "a44", "a8")
MSE %>% DT::datatable()
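One reason architectures with the same total neuron count behave differently is that they contain different numbers of trainable weights. With 13 inputs and 1 output, counting the weights (including bias terms) for each configuration is a one-line fold, sketched below with a made-up helper name:

```r
# number of weights in a fully connected network with bias terms:
# sum over consecutive layer pairs of (n_in + 1) * n_out
n_weights <- function(hidden, n_in = 13, n_out = 1) {
  layers <- c(n_in, hidden, n_out)
  sum((head(layers, -1) + 1) * tail(layers, -1))
}

sapply(list(c(2, 2, 2, 2), c(2, 4, 2), c(4, 4), c(8)), n_weights)
## [1]  49  53  81 121
```

So the single-layer network with 8 neurons has more than twice as many parameters as the deep 2-2-2-2 network, despite the identical neuron count.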
Calculate the average performance for each architecture:
MSE.means <- data.frame(
mean = map_dbl(MSE, mean),
error = map_dbl(MSE, ~{qt(0.975, df = nrow(MSE)-1) * sd(.x)/sqrt(nrow(MSE))})
) %>% rownames_to_column("architecture")
MSE.means %>%
ggplot() +
aes(x = architecture, y = mean, label = round(mean,1)) +
geom_bar(stat="identity") +
geom_errorbar(aes(ymin=mean-error, ymax=mean+error), width=.2) +
geom_label() +
labs(title = "Performance differences between different network architectures",
caption = "Error bars show the 95% confidence intervals") +
ggthemes::theme_tufte(base_size = 16)
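The error bars come from the usual t-based confidence interval for a mean: the half-width is the t quantile times the standard error, as in the error column computed above. A small self-contained sketch (the helper name is made up):

```r
# 95% confidence half-width for a sample mean: t quantile * standard error
ci_half_width <- function(x, level = 0.95) {
  qt(1 - (1 - level) / 2, df = length(x) - 1) * sd(x) / sqrt(length(x))
}

x <- c(10, 12, 14)
ci_half_width(x)  # roughly 4.97 for this toy sample
```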