Author: Christian Nilsen, Geosyntec Consultants Email: cnilsen@geosyntec.com
Date Created: 2019-08-02
Copyright (c) Geosyntec Consultants, 2019This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3.0
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. For a copy of the GNU General Public License see https://www.gnu.org/licenses/.
This notebook performs censored Baseyian linear regression with random effects on annual traffic volumes and pollutant concentrations. It uses data from the USGS Highway-Runoff Database (HRDB)Version 1.1.0 (Granato and Cazenas (2009)). Bayesian analsysis is performed by fitting a generalized linear mixed model using Markov chain Monte Carlo techniques and relies heavily on the `MCMCglmm package (hadfield2010mcmc).
The HRDB was a collaboration by the U.S. Geological Survey (USGS) and the Federal Highway Administration (FHWA). It provides highway-runoff data from 242 highway sites across the country; data from 6,837 storm events; and 106,441 concentration values with data for 414 different water-quality constituents.
For purposes of this study, data were limited to those sites in Washington, Oregon, and Northern California.
library(tidyverse)
library(plotMCMC)
library(dplyr)
library (MCMCglmm)
library(bayesplot)
library(hrbrthemes)
theme_set(theme_ipsum_rc())
set.seed(50)For purposes of this study,
#View(HwyDBv3)
HwyDB <- read.csv ("data/HwyDB.csv",stringsAsFactors=FALSE )
##Set local parameters
units <- 'ug/L'
#make a dataframe
params = list(
T_Copper = 'Copper, water, unfiltered, recoverable, micrograms per liter',
D_Copper = 'Copper, water, filtered, micrograms per liter',
T_Zinc = 'Zinc, water, unfiltered, recoverable, micrograms per liter',
T_Cadmium = 'Cadmium, water, filtered, micrograms per liter',
TKN = 'Ammonia plus organic nitrogen, water, unfiltered, milligrams per liter as nitrogen (TKN)',
T_Phosphorus = 'Phosphorus, water, unfiltered, milligrams per liter',
Suspended_Solids = 'Solids, suspended, water, milligrams per liter')
#Select lower threshold for ADT data
lowerBound <- 5000 #highest traffic ADT in the monitoring data#library(dplyr)
#Extract relevant data from overall data file
HwyDB <- read_csv("data/HwyDB.csv") %>%
dplyr::filter(`S Adt` > lowerBound) %>%
rename(EMC = `S EMC Value`,parameter =`T Parameter Name`,ADT = `S Adt`,Location = `Site ID`, State = `State ID`)
HwyDB$State<-as.factor(HwyDB$State)
HwyDB$Location <-as.factor(HwyDB$Location)
hwyData <- HwyDB[which(HwyDB$parameter %in% params),]
parameter_list = (unique(hwyData$parameter))
code_list = unique(hwyData$`T Pcode`)
codes <- data.frame(name = parameter_list,code = code_list,stringsAsFactors = FALSE)plot <- ggplot(data = hwyData) +geom_point(aes(x=ADT,y=EMC)) +
facet_wrap(ncol = 2, vars(parameter),scales = "free") +
theme(plot.title = element_text(hjust = 0.5))
plotadd_surv <- function(df) {
df$cenMin <- ifelse(df$NonDetectFlag,-Inf, (df$EMC))
df$cenMax <- (df$EMC)
df$cenMin_log <- ifelse(df$NonDetectFlag,-Inf, log(df$EMC))
df$cenMax_log <- log(df$EMC)
return(df)
}#### Gray Callout {.bs-callout .bs-callout-gray}
mcmc_calc <- function(coc.local,trans.bin) {
#get linear models
#selected_model <- compare_linear_models(x)[model_num][[1]] #this runs the linear model selection
df <- as.data.frame(subset(hwyData, `T Pcode` == coc.local))%>%
add_surv()
#make the prior_structures
prior.1<-list(R=list(V=1, fix=1), G=list(G1=list(V=1, nu=0.002)))
if (trans.bin) {
mcmc_formula <-
as.formula(
"cbind(cenMin_log, cenMax_log) ~ log(ADT) "
)}
else {
mcmc_formula <-
as.formula(
"cbind(cenMin_log, cenMax_log) ~ ADT "
)
}
mcmc_results <-
MCMCglmm(
mcmc_formula,
random = ~ Location,
data = df,
family = "cengaussian",
verbose = FALSE, prior = prior.1, singular.ok = TRUE,
nitt = 40000, thin = 13, burnin = 10000
)
return((mcmc_results))
}# Zn <- mcmc_calc("p01092")
# summary(Zn)
# ```
#
# ```{r}
# Zn <- mcmc_calc("p01092")
# Cu <- mcmc_calc("p01042")
# Pb <- mcmc_calc("p01051")
# Cd <- mcmc_calc("p01027")
# TP <- mcmc_calc("p00665")
# TKN <- mcmc_calc("p00625")
# TSS <- mcmc_calc("p80154") #note TSS is in mg/L # add custom functions to extract estimates (tidy) and goodness-of-fit (glance) information
tidy.MCMCglmm <- function(object, ...) {
s <- summary(object, ...)
ret <- tibble::tibble(term = row.names(s$solutions),
estimate = s$solutions[, 1],
conf.low = s$solutions[, 2],
conf.high = s$solutions[, 3])
ret
}
glance.MCMCglmm <- function(object, ...) {
ret <- tibble::tibble(dic = object$DIC,
n = nrow(object$X))
ret
}plot_mcmc_predictions <- function(coc, mcmc_results, cocName) {
training_predictions <- data.frame(#predicted = exp(
(predict(mcmc_results, interval = "confidence")))
training_predint <-
cbind(df,
data.frame(predict(mcmc_results, interval = "prediction")))
model.compare.predictions <-
cbind(df,
training_predictions)
plot <- ggplot(model.compare.predictions) + geom_abline() +
geom_ribbon(aes(x = fit, ymin = lwr, ymax = upr), alpha = 0.1) +
geom_ribbon(data = training_predint,
aes(x = fit, ymin = lwr, ymax = upr),
alpha = 0.4) +
geom_point(aes(x = exp(fit), y = (EMC)), color = 'blue',fill='white', alpha = 0.1) +
#scale_y_log10()+scale_x_log10()+
labs(
x = "Predicted Concentration ( µg/L)",
y = "Measured Concentration ( µg/L)",
title = "Predicted vs. Measured Concentrations",
subtitle = cocName,
caption = "Data from Granato, G.E., 2019, Highway-Runoff Database (HRDB) Version 1.1.0: \n U.S. Geological Survey data release, https://doi.org/10.5066/P94VL32J.")
return(plot)
}showResults <- function(coc,mcmc_results,cocName) {
df <- (subset(hwyData, `T Pcode` == coc)) %>%
add_surv()
training_predictions <- data.frame(#predicted = exp(
(predict(mcmc_results, interval = "confidence")))
training_predint <-
cbind(df,
data.frame(predict(mcmc_results, interval = "prediction")))
model.compare.predictions <-
cbind(df,
training_predictions)
predictPlot <- ggplot(model.compare.predictions) + geom_abline() +
geom_ribbon(aes(x = fit, ymin = lwr, ymax = upr), alpha = 0.1) +
geom_ribbon(data = training_predint,
aes(x = fit, ymin = lwr, ymax = upr),
alpha = 0.4) +
geom_point(aes(x = fit, y = log(EMC)),fill='white', alpha = 0.1) +
#scale_y_log10()+scale_x_log10()+
labs(
x = "Predicted Concentration ( µg/L)",
y = "Measured Concentration ( µg/L)",
title = "Predicted vs. Measured Concentrations",
subtitle = cocName,
caption = "Data from Granato, G.E., 2019, Highway-Runoff Database (HRDB) Version 1.1.0: \n U.S. Geological Survey data release, https://doi.org/10.5066/P94VL32J.")
regression_plot <-ggplot(model.compare.predictions) +
geom_boxplot(aes(y = log(EMC), x = ADT, group = Location),alpha = 0.8, outlier.alpha = 0.1) +
geom_point(aes(y = log(EMC), x = ADT, color = State), alpha = 0.4) +
geom_line(aes(y = fit, x = ADT), color = 'red',alpha = 0.8) +scale_x_log10()
solution_summary<-(summary(mcmc_results))
#bayes plots
#show uncertainty intervals under esimated posterior density curves
plot.1 <- mcmc_areas(mcmc_results$Sol,prob = 0.80, pprob_outer = 0.95,point_est="mean")+ggplot2::labs(title = coc, subtitle = "Posterior distributions with medians and 80% intervals")
#generate scatter plot of predictions
return(list(regression_plot, predictPlot, solution_summary, plot.1))
}## [1] "p01042"
## [1] "Copper, water, unfiltered, recoverable, micrograms per liter"
## [[1]]
##
## [[2]]
##
## [[3]]
##
## Iterations = 10001:39992
## Thinning interval = 13
## Sample size = 2308
##
## DIC: 2113.325
##
## G-structure: ~Location
##
## post.mean l-95% CI u-95% CI eff.samp
## Location 0.3246 0.1899 0.4915 2308
##
## R-structure: ~units
##
## post.mean l-95% CI u-95% CI eff.samp
## units 1 1 1 0
##
## Location effects: cbind(cenMin_log, cenMax_log) ~ ADT
##
## post.mean l-95% CI u-95% CI eff.samp pMCMC
## (Intercept) 2.572e+00 2.336e+00 2.813e+00 2308 <4e-04 ***
## ADT 6.929e-06 4.420e-06 1.006e-05 2053 <4e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[4]]
## [1] "p01092"
## [1] "Zinc, water, unfiltered, recoverable, micrograms per liter"
## [[1]]
##
## [[2]]
##
## [[3]]
##
## Iterations = 10001:39992
## Thinning interval = 13
## Sample size = 2308
##
## DIC: 2500.242
##
## G-structure: ~Location
##
## post.mean l-95% CI u-95% CI eff.samp
## Location 0.5683 0.3395 0.8085 2308
##
## R-structure: ~units
##
## post.mean l-95% CI u-95% CI eff.samp
## units 1 1 1 0
##
## Location effects: cbind(cenMin_log, cenMax_log) ~ log(ADT)
##
## post.mean l-95% CI u-95% CI eff.samp pMCMC
## (Intercept) 0.8014 -1.4341 2.8727 2308 0.481
## log(ADT) 0.3633 0.1673 0.5690 2308 <4e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[4]]
## [1] "p01025"
## [1] "Cadmium, water, filtered, micrograms per liter"
## [[1]]
##
## [[2]]
##
## [[3]]
##
## Iterations = 10001:39992
## Thinning interval = 13
## Sample size = 2308
##
## DIC: 552.7904
##
## G-structure: ~Location
##
## post.mean l-95% CI u-95% CI eff.samp
## Location 0.3107 0.1153 0.5484 2058
##
## R-structure: ~units
##
## post.mean l-95% CI u-95% CI eff.samp
## units 1 1 1 0
##
## Location effects: cbind(cenMin_log, cenMax_log) ~ log(ADT)
##
## post.mean l-95% CI u-95% CI eff.samp pMCMC
## (Intercept) -4.33945 -6.56712 -2.05720 2174 0.000867 ***
## log(ADT) 0.24460 0.03332 0.45008 2308 0.022530 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[4]]
## Nutrients
## [1] "p00665"
## [1] "Phosphorus, water, unfiltered, milligrams per liter"
## [[1]]
##
## [[2]]
##
## [[3]]
##
## Iterations = 10001:39992
## Thinning interval = 13
## Sample size = 2308
##
## DIC: 1800.666
##
## G-structure: ~Location
##
## post.mean l-95% CI u-95% CI eff.samp
## Location 0.523 0.2848 0.7971 2308
##
## R-structure: ~units
##
## post.mean l-95% CI u-95% CI eff.samp
## units 1 1 1 0
##
## Location effects: cbind(cenMin_log, cenMax_log) ~ ADT
##
## post.mean l-95% CI u-95% CI eff.samp pMCMC
## (Intercept) -1.830e+00 -2.134e+00 -1.488e+00 2308 <4e-04 ***
## ADT 1.460e-07 -3.532e-06 3.802e-06 2308 0.942
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[4]]
## [1] "p00625"
## [1] "Ammonia plus organic nitrogen, water, unfiltered, milligrams per liter as nitrogen (TKN)"
## [[1]]
##
## [[2]]
##
## [[3]]
##
## Iterations = 10001:39992
## Thinning interval = 13
## Sample size = 2308
##
## DIC: 1748.872
##
## G-structure: ~Location
##
## post.mean l-95% CI u-95% CI eff.samp
## Location 0.08917 0.03155 0.1581 2308
##
## R-structure: ~units
##
## post.mean l-95% CI u-95% CI eff.samp
## units 1 1 1 0
##
## Location effects: cbind(cenMin_log, cenMax_log) ~ log(ADT)
##
## post.mean l-95% CI u-95% CI eff.samp pMCMC
## (Intercept) -2.4795 -3.7827 -1.2783 3435 <4e-04 ***
## log(ADT) 0.2603 0.1402 0.3773 3435 <4e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[4]]
## [1] "p00530"
## [1] "Solids, suspended, water, milligrams per liter"
## [[1]]
##
## [[2]]
##
## [[3]]
##
## Iterations = 10001:39992
## Thinning interval = 13
## Sample size = 2308
##
## DIC: 3112.74
##
## G-structure: ~Location
##
## post.mean l-95% CI u-95% CI eff.samp
## Location 1.188 0.7539 1.667 2308
##
## R-structure: ~units
##
## post.mean l-95% CI u-95% CI eff.samp
## units 1 1 1 0
##
## Location effects: cbind(cenMin_log, cenMax_log) ~ ADT
##
## post.mean l-95% CI u-95% CI eff.samp pMCMC
## (Intercept) 3.979e+00 3.555e+00 4.408e+00 2308 <4e-04 ***
## ADT 1.640e-06 -3.679e-06 6.138e-06 2560 0.514
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[4]]
Granato, G E, and P A Cazenas. 2009. “Highway-Runoff Database (HRDB Version 1.0)—A Data Warehouse and Preprocessor for the Stochastic Empirical Loading and Dilution Model.” Federal Highway Administration. Washington: US Department of Transportation.