Author: Christian Nilsen, Geosyntec Consultants Email:

Date Created: 2019-08-02

Copyright (c) Geosyntec Consultants, 2019

License

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3.0

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. For a copy of the GNU General Public License see https://www.gnu.org/licenses/.

Introduction

This notebook performs censored Baseyian linear regression with random effects on annual traffic volumes and pollutant concentrations. It uses data from the USGS Highway-Runoff Database (HRDB)Version 1.1.0 (Granato and Cazenas (2009)). Bayesian analsysis is performed by fitting a generalized linear mixed model using Markov chain Monte Carlo techniques and relies heavily on the `MCMCglmm package (hadfield2010mcmc).

Background

Methods

The HRDB was a collaboration by the U.S. Geological Survey (USGS) and the Federal Highway Administration (FHWA). It provides highway-runoff data from 242 highway sites across the country; data from 6,837 storm events; and 106,441 concentration values with data for 414 different water-quality constituents.

For purposes of this study, data were limited to those sites in Washington, Oregon, and Northern California.

library(tidyverse)
library(plotMCMC)
library(dplyr)
library (MCMCglmm)
library(bayesplot)
library(hrbrthemes)
theme_set(theme_ipsum_rc())
set.seed(50)

For purposes of this study,

#View(HwyDBv3)
HwyDB <- read.csv ("data/HwyDB.csv",stringsAsFactors=FALSE )

##Set local parameters 

units <- 'ug/L'
#make a dataframe 
params = list(

T_Copper = 'Copper, water, unfiltered, recoverable, micrograms per liter',
D_Copper = 'Copper, water, filtered, micrograms per liter',
T_Zinc = 'Zinc, water, unfiltered, recoverable, micrograms per liter',
T_Cadmium = 'Cadmium, water, filtered, micrograms per liter',
TKN = 'Ammonia plus organic nitrogen, water, unfiltered, milligrams per liter as nitrogen (TKN)',
T_Phosphorus = 'Phosphorus, water, unfiltered, milligrams per liter',
Suspended_Solids = 'Solids, suspended, water, milligrams per liter')

#Select lower threshold for ADT data
lowerBound <- 5000 #highest traffic ADT in the monitoring data
#library(dplyr)
#Extract relevant data from overall data file



HwyDB <- read_csv("data/HwyDB.csv") %>%
  dplyr::filter(`S Adt` > lowerBound) %>% 
  rename(EMC = `S EMC Value`,parameter =`T Parameter Name`,ADT = `S Adt`,Location = `Site ID`, State = `State ID`) 

HwyDB$State<-as.factor(HwyDB$State)
HwyDB$Location <-as.factor(HwyDB$Location)

hwyData <- HwyDB[which(HwyDB$parameter %in% params),]



parameter_list = (unique(hwyData$parameter))
code_list = unique(hwyData$`T Pcode`)
codes <- data.frame(name = parameter_list,code = code_list,stringsAsFactors = FALSE)
plot <- ggplot(data = hwyData) +geom_point(aes(x=ADT,y=EMC)) + 
    facet_wrap(ncol = 2, vars(parameter),scales = "free") +
    theme(plot.title = element_text(hjust = 0.5))

plot

add_surv <- function(df) {
  df$cenMin <- ifelse(df$NonDetectFlag,-Inf, (df$EMC))
  df$cenMax  <- (df$EMC)
  df$cenMin_log <- ifelse(df$NonDetectFlag,-Inf, log(df$EMC))
  df$cenMax_log  <- log(df$EMC)
  return(df)
}
#### Gray Callout {.bs-callout .bs-callout-gray}

mcmc_calc function

mcmc_calc <- function(coc.local,trans.bin) {
  #get linear models
  #selected_model <- compare_linear_models(x)[model_num][[1]]  #this runs the linear model selection
  df <- as.data.frame(subset(hwyData, `T Pcode` == coc.local))%>% 
    add_surv()


#make the prior_structures 
prior.1<-list(R=list(V=1, fix=1), G=list(G1=list(V=1, nu=0.002)))



if (trans.bin) {
mcmc_formula <-
      as.formula(
         "cbind(cenMin_log, cenMax_log) ~  log(ADT) "
      )}
else {
  mcmc_formula <-
      as.formula(
         "cbind(cenMin_log, cenMax_log) ~  ADT "
      )
}

mcmc_results <-
    MCMCglmm(
      mcmc_formula,
      random = ~  Location,
      data =  df, 
      family = "cengaussian",
      verbose = FALSE, prior = prior.1,  singular.ok = TRUE,
      nitt = 40000, thin = 13, burnin = 10000
    )
    return((mcmc_results))
}

parameters

# Zn <- mcmc_calc("p01092")
# summary(Zn)
# ```
# 
# ```{r}
# Zn <- mcmc_calc("p01092")
# Cu <- mcmc_calc("p01042")
# Pb <- mcmc_calc("p01051")
# Cd <- mcmc_calc("p01027")
# TP <- mcmc_calc("p00665")
# TKN <- mcmc_calc("p00625")
# TSS <- mcmc_calc("p80154") #note TSS is in mg/L 
# add custom functions to extract estimates (tidy) and goodness-of-fit (glance) information
tidy.MCMCglmm <- function(object, ...) {
    s <- summary(object, ...)
    ret <- tibble::tibble(term = row.names(s$solutions),
                          estimate = s$solutions[, 1],
                          conf.low = s$solutions[, 2],
                          conf.high = s$solutions[, 3])
    ret
}
glance.MCMCglmm <- function(object, ...) {
    ret <- tibble::tibble(dic = object$DIC,
                          n = nrow(object$X))
    ret
}

Predictions

plot_mcmc_predictions <- function(coc, mcmc_results, cocName) {
 
  
  training_predictions <- data.frame(#predicted = exp(
    (predict(mcmc_results, interval = "confidence")))
  
  training_predint <-
    cbind(df,
          data.frame(predict(mcmc_results, interval = "prediction")))
  
  model.compare.predictions <-
    cbind(df,
          training_predictions)
  
  plot <- ggplot(model.compare.predictions) + geom_abline() +
    geom_ribbon(aes(x = fit, ymin = lwr, ymax = upr), alpha = 0.1) +
    geom_ribbon(data = training_predint,
                aes(x = fit, ymin = lwr, ymax = upr),
                alpha = 0.4) +
    geom_point(aes(x = exp(fit), y = (EMC)), color = 'blue',fill='white', alpha = 0.1) + 
    
   #scale_y_log10()+scale_x_log10()+
    labs(
      x = "Predicted Concentration ( µg/L)",
      y = "Measured Concentration ( µg/L)",
      title =  "Predicted vs. Measured Concentrations",
      subtitle = cocName,
      caption = "Data from Granato, G.E., 2019, Highway-Runoff Database (HRDB) Version 1.1.0: \n U.S. Geological Survey data release, https://doi.org/10.5066/P94VL32J.")
  return(plot)
}

show results

showResults <- function(coc,mcmc_results,cocName) {
   df <- (subset(hwyData, `T Pcode` == coc)) %>%
    add_surv()
   
  training_predictions <- data.frame(#predicted = exp(
    (predict(mcmc_results, interval = "confidence")))
  
  training_predint <-
    cbind(df,
          data.frame(predict(mcmc_results, interval = "prediction")))
  
  model.compare.predictions <-
    cbind(df,
          training_predictions)
  
  predictPlot <- ggplot(model.compare.predictions) + geom_abline() +
    geom_ribbon(aes(x = fit, ymin = lwr, ymax = upr), alpha = 0.1) +
    geom_ribbon(data = training_predint,
                aes(x = fit, ymin = lwr, ymax = upr),
                alpha = 0.4) +
    geom_point(aes(x = fit, y = log(EMC)),fill='white', alpha = 0.1) + 
    
   #scale_y_log10()+scale_x_log10()+
    labs(
      x = "Predicted Concentration ( µg/L)",
      y = "Measured Concentration ( µg/L)",
      title =  "Predicted vs. Measured Concentrations",
      subtitle = cocName,
      caption = "Data from Granato, G.E., 2019, Highway-Runoff Database (HRDB) Version 1.1.0: \n U.S. Geological Survey data release, https://doi.org/10.5066/P94VL32J.")

  regression_plot <-ggplot(model.compare.predictions) + 
    geom_boxplot(aes(y = log(EMC), x = ADT, group = Location),alpha = 0.8, outlier.alpha = 0.1) +
    geom_point(aes(y = log(EMC), x = ADT, color = State), alpha = 0.4) +
     
    geom_line(aes(y = fit, x = ADT), color = 'red',alpha = 0.8) +scale_x_log10()
    
    
  solution_summary<-(summary(mcmc_results))
  
  #bayes plots 
  
  #show uncertainty intervals under esimated posterior density curves 
plot.1 <- mcmc_areas(mcmc_results$Sol,prob = 0.80, pprob_outer = 0.95,point_est="mean")+ggplot2::labs(title = coc, subtitle   = "Posterior distributions with medians and 80% intervals")

#generate scatter plot of predictions 





  
  return(list(regression_plot, predictPlot, solution_summary, plot.1))
}

Summarize Results

# Do some predictive checks 

color_scheme_set("blue")
bayes_plots <- function(fit,coc) {
fit <- TSS
coc = param.code
#coc = 'Total Suspended Solids - Water - Total'

df <- (subset(hwyData, `T Pcode` == coc)) %>%
  add_surv()




return(list(plot.1,plot.2))
}

Generate Regressions

Metals

Copper

## [1] "p01042"
## [1] "Copper, water, unfiltered, recoverable, micrograms per liter"
## [[1]]

## 
## [[2]]

## 
## [[3]]
## 
##  Iterations = 10001:39992
##  Thinning interval  = 13
##  Sample size  = 2308 
## 
##  DIC: 2113.325 
## 
##  G-structure:  ~Location
## 
##          post.mean l-95% CI u-95% CI eff.samp
## Location    0.3246   0.1899   0.4915     2308
## 
##  R-structure:  ~units
## 
##       post.mean l-95% CI u-95% CI eff.samp
## units         1        1        1        0
## 
##  Location effects: cbind(cenMin_log, cenMax_log) ~ ADT 
## 
##             post.mean  l-95% CI  u-95% CI eff.samp  pMCMC    
## (Intercept) 2.572e+00 2.336e+00 2.813e+00     2308 <4e-04 ***
## ADT         6.929e-06 4.420e-06 1.006e-05     2053 <4e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[4]]

Zinc

## [1] "p01092"
## [1] "Zinc, water, unfiltered, recoverable, micrograms per liter"
## [[1]]

## 
## [[2]]

## 
## [[3]]
## 
##  Iterations = 10001:39992
##  Thinning interval  = 13
##  Sample size  = 2308 
## 
##  DIC: 2500.242 
## 
##  G-structure:  ~Location
## 
##          post.mean l-95% CI u-95% CI eff.samp
## Location    0.5683   0.3395   0.8085     2308
## 
##  R-structure:  ~units
## 
##       post.mean l-95% CI u-95% CI eff.samp
## units         1        1        1        0
## 
##  Location effects: cbind(cenMin_log, cenMax_log) ~ log(ADT) 
## 
##             post.mean l-95% CI u-95% CI eff.samp  pMCMC    
## (Intercept)    0.8014  -1.4341   2.8727     2308  0.481    
## log(ADT)       0.3633   0.1673   0.5690     2308 <4e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[4]]

Cadmium

## [1] "p01025"
## [1] "Cadmium, water, filtered, micrograms per liter"
## [[1]]

## 
## [[2]]

## 
## [[3]]
## 
##  Iterations = 10001:39992
##  Thinning interval  = 13
##  Sample size  = 2308 
## 
##  DIC: 552.7904 
## 
##  G-structure:  ~Location
## 
##          post.mean l-95% CI u-95% CI eff.samp
## Location    0.3107   0.1153   0.5484     2058
## 
##  R-structure:  ~units
## 
##       post.mean l-95% CI u-95% CI eff.samp
## units         1        1        1        0
## 
##  Location effects: cbind(cenMin_log, cenMax_log) ~ log(ADT) 
## 
##             post.mean l-95% CI u-95% CI eff.samp    pMCMC    
## (Intercept)  -4.33945 -6.56712 -2.05720     2174 0.000867 ***
## log(ADT)      0.24460  0.03332  0.45008     2308 0.022530 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[4]]

## Nutrients

Phosphorus

## [1] "p00665"
## [1] "Phosphorus, water, unfiltered, milligrams per liter"
## [[1]]

## 
## [[2]]

## 
## [[3]]
## 
##  Iterations = 10001:39992
##  Thinning interval  = 13
##  Sample size  = 2308 
## 
##  DIC: 1800.666 
## 
##  G-structure:  ~Location
## 
##          post.mean l-95% CI u-95% CI eff.samp
## Location     0.523   0.2848   0.7971     2308
## 
##  R-structure:  ~units
## 
##       post.mean l-95% CI u-95% CI eff.samp
## units         1        1        1        0
## 
##  Location effects: cbind(cenMin_log, cenMax_log) ~ ADT 
## 
##              post.mean   l-95% CI   u-95% CI eff.samp  pMCMC    
## (Intercept) -1.830e+00 -2.134e+00 -1.488e+00     2308 <4e-04 ***
## ADT          1.460e-07 -3.532e-06  3.802e-06     2308  0.942    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[4]]

TKN

## [1] "p00625"
## [1] "Ammonia plus organic nitrogen, water, unfiltered, milligrams per liter as nitrogen (TKN)"
## [[1]]

## 
## [[2]]

## 
## [[3]]
## 
##  Iterations = 10001:39992
##  Thinning interval  = 13
##  Sample size  = 2308 
## 
##  DIC: 1748.872 
## 
##  G-structure:  ~Location
## 
##          post.mean l-95% CI u-95% CI eff.samp
## Location   0.08917  0.03155   0.1581     2308
## 
##  R-structure:  ~units
## 
##       post.mean l-95% CI u-95% CI eff.samp
## units         1        1        1        0
## 
##  Location effects: cbind(cenMin_log, cenMax_log) ~ log(ADT) 
## 
##             post.mean l-95% CI u-95% CI eff.samp  pMCMC    
## (Intercept)   -2.4795  -3.7827  -1.2783     3435 <4e-04 ***
## log(ADT)       0.2603   0.1402   0.3773     3435 <4e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[4]]

TSS

## [1] "p00530"
## [1] "Solids, suspended, water, milligrams per liter"
## [[1]]

## 
## [[2]]

## 
## [[3]]
## 
##  Iterations = 10001:39992
##  Thinning interval  = 13
##  Sample size  = 2308 
## 
##  DIC: 3112.74 
## 
##  G-structure:  ~Location
## 
##          post.mean l-95% CI u-95% CI eff.samp
## Location     1.188   0.7539    1.667     2308
## 
##  R-structure:  ~units
## 
##       post.mean l-95% CI u-95% CI eff.samp
## units         1        1        1        0
## 
##  Location effects: cbind(cenMin_log, cenMax_log) ~ ADT 
## 
##              post.mean   l-95% CI   u-95% CI eff.samp  pMCMC    
## (Intercept)  3.979e+00  3.555e+00  4.408e+00     2308 <4e-04 ***
## ADT          1.640e-06 -3.679e-06  6.138e-06     2560  0.514    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[4]]

Bibliography

Granato, G E, and P A Cazenas. 2009. “Highway-Runoff Database (HRDB Version 1.0)—A Data Warehouse and Preprocessor for the Stochastic Empirical Loading and Dilution Model.” Federal Highway Administration. Washington: US Department of Transportation.