---
title: "QUBES Lesson: Oysters and Water Quality in Virginia"
author: "Julia Josephs"
output: html_notebook
---
This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*.
```{r}
#load your libraries - if you don't have them use the install.packages("") function to install them.
knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(tidyverse)
library(lubridate)
library(ggplot2)
library(knitr)
#bring in the James River oyster data - all located in the lower James
oysterjames <- read.csv("LowerJamesOysters.csv")
#look at the data
oysterjames
#note that the data is in character form
summary(oysterjames)
```
```{r}
#manipulate the data using tidyverse- I will do it step by step and then all together
#separate out the columns you need - Year, Total
oysterjames%>%
select(Year, Total) -> oysterjames1
oysterjames1
#convert year and total to numeric from character
oysterjames1 %>%
mutate_if(is.character,as.numeric)-> oysterjames2
oysterjames2
#arrange by year
oysterjames2%>%
arrange(Year) -> oysterjames3
oysterjames3
#get average Total for each year
oysterjames3%>%
group_by(Year)%>%
mutate(mean(Total))%>%
rename("average" = "mean(Total)")%>%
select(Year, average)-> oysterjames4
oysterjames4
```
```{r}
#all of the steps above but all together
oysterjames%>%
select(Year, Total)%>%
mutate_if(is.character,as.numeric)%>%
arrange(Year)%>%
group_by(Year)%>%
mutate(mean(Total))%>%
rename("average" = "mean(Total)")%>%
select(Year, average)-> oystersjamesave
oystersjamesave
#get rid of duplicate rows
oystersjamesave%>%
distinct(Year, average, .keep_all = TRUE)->oystersjamesave2
oystersjamesave2
```
```{r}
#look at a histogram of the data to see if it looks normal
hist(oystersjamesave2$Year)
hist(oystersjamesave2$average)
#the year data looks normally distributed but the average data does not so we can log it to transform it and make it more normal
hist(log(oystersjamesave2$average))
```
```{r}
#plot data to visualize how the number of oysters changes through time
#We are going to create two scatterplots in order to visually see the relationship between the two variables, total oysters and year. We must do this before running the other statistical tests because if the plot does not show any increasing or decreasing trends, a linear regression model will not be useful.
oystersjamesave%>%
ggplot(aes(x=Year, y=average))+
geom_point()+
expand_limits(x = 2020)+
ggtitle("Average Total Oysters in the James River by Year")+
ylab("Average Total Oysters")+
geom_line()
#plot the data to visualize the relationship between oysters and year
ggplot(data=oystersjamesave2, aes(Year, average)) +
geom_point()+
labs(x="Year", y = "Average Total Oysters")+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+
expand_limits(x=2020)
```
```{r}
#run a pearson's correlation test to measure how strong a relationship is between two variables by looking at the correlation coefficient: 1 indicates a strong positive relationship, -1 indicates a strong negative relationship, and a result of zero indicates no relationship at all.
cor.test(oystersjamesave2$Year, oystersjamesave2$average)
#the correlation coefficient is 0.8215589, which is very close to 1 meaning there is a strong positive relationship between year and oysters
#make a linear regression model to find a relationship between year and oysters, Linear regression attempts to model the relationship between two variables by fitting a linear equation to observed data.
fit.0 <- lm(Year ~ average, data=oystersjamesave2)
fit.0
summary(fit.0)
#because the p-value is low and the multiple R-squared is high, there is a good fit and a relationship does exist
```
```{r}
#bring in the lower James river water quality data
waterqualityjames <- read.csv("LowerJamesWaterQuality.csv")
waterqualityjames
```
```{r}
#we are going to start by looking at total nitrogen
#manipulate the data- select only the columns we need, filter out the parameter we need, total nitrogen, separate the date, make the data numeric, make the years go in chronological order, get the average total nitrogen per year
waterqualityjames%>%
select(Parameter, MeasureValue, SampleDate)%>%
filter(Parameter=="TN")%>%
separate(SampleDate, c("month", "day", "year"))%>%
mutate_if(is.character,as.numeric)%>%
arrange(year)%>%
group_by(year)%>%
mutate(mean(MeasureValue))%>%
rename("average" = "mean(MeasureValue)")%>%
select(year, average)%>%
rename(Year=year)-> waterjamesTN
waterjamesTN
#remove duplicate rows
waterjamesTN%>%
distinct(Year, average, .keep_all = TRUE)->waterjamesTN2
waterjamesTN2
```
```{r}
#plot total nitrogen data through time with labels and a title
waterjamesTN2%>%
ggplot(aes(x=Year, y= average)) +
geom_point()+
expand_limits(x=2020)+
ggtitle("Total Nitrogen in the James River by Year")+
ylab("Average Total Nitrogen")
#plot the data to visualize the relationship between total nitrogen and year
ggplot(data=waterjamesTN2, aes(Year, average)) +
geom_point()+
labs(x="Year", y = "Total Nitrogen")+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+
expand_limits(x=2020)
```
```{r}
#run a pearson's correlation test to see if the variables year and total nitrogen are related
cor.test(waterjamesTN2$Year, waterjamesTN2$average)
#make a linear regression model to find a relationship between year and total nitrogen
fit.1.0 <- lm(Year ~ average, data=waterjamesTN2)
fit.1.0
summary(fit.1.0)
```
```{r}
#next we are going to look at chlorophyll-A in the James
#manipulate the data as before but with the parameter CHLA
waterqualityjames%>%
select(Parameter, MeasureValue, SampleDate)%>%
filter(Parameter=="CHLA")%>%
separate(SampleDate, c("month", "day", "year"))%>%
mutate_if(is.character,as.numeric)%>%
filter(year>=2008)%>%
arrange(year)%>%
group_by(year)%>%
mutate(mean(MeasureValue))%>%
rename("average" = "mean(MeasureValue)")%>%
select(year, average)%>%
rename(Year=year)-> waterjamesCHL
waterjamesCHL
#remove duplicate rows
waterjamesCHL%>%
distinct(Year, average, .keep_all = TRUE)->waterjamesCHL2
waterjamesCHL2
```
```{r}
#plot the data to visualize how it changes through time
waterjamesCHL2%>%
ggplot(aes(x=Year, y= average)) +
geom_point()+
ggtitle("Chlorophyll-A in the James River by Year")+
ylab("Chlorophyll-A")+
expand_limits(x=2020)
#plot the data to visualize the relationship between chlorophyll-A and year
ggplot(data=waterjamesCHL2, aes(Year, average)) +
geom_point()+
labs(x="Year", y = "Total Chlorophyll-A")+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+
expand_limits(x=2020)
```
```{r}
#run a pearson's correlation test to see if the variables year and chlorophyll-A are related
cor.test(waterjamesCHL2$Year, waterjamesCHL2$average)
#make a linear regression model to find a relationship between year and chlorophyll-A
fit.2 <- lm(Year ~ average, data=waterjamesCHL2)
fit.2
summary(fit.2)
```
```{r}
#next is total phosphorus in the James
#manipulate the data as before but with TP
waterqualityjames%>%
select(Parameter, MeasureValue, SampleDate)%>%
filter(Parameter=="TP")%>%
separate(SampleDate, c("month", "day", "year"))%>%
mutate_if(is.character,as.numeric)%>%
arrange(year)%>%
group_by(year)%>%
mutate(mean(MeasureValue))%>%
rename("average" = "mean(MeasureValue)")%>%
select(year, average)%>%
rename(Year=year)-> waterjamesTP
waterjamesTP
#remove duplicate rows
waterjamesTP%>%
distinct(Year, average, .keep_all = TRUE)->waterjamesTP2
waterjamesTP2
```
```{r}
#plot data
waterjamesTP2%>%
ggplot(aes(x=Year, y= average)) +
geom_point()+
expand_limits(x=2020)+
ggtitle("Total Phosphorus in the James River by Year")+
ylab("Average Total Phosphorus")
#plot the data to visualize the relationship between TP and year
ggplot(data=waterjamesTP2, aes(Year, average)) +
geom_point()+
labs(x="Year", y = "Total Phosphorus")+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+
expand_limits(x=2020)
```
```{r}
#run a pearson's correlation test to see if the variables year and total phosphorus are related
cor.test(waterjamesTP2$Year, waterjamesTP2$average)
#make a linear regression model to find a relationship between year and total phosphorus
fit.3 <- lm(Year ~ average, data=waterjamesTP2)
fit.3
summary(fit.3)
```
```{r}
#last is turbidity in the james
#manipulate the data as before but with TURB_NTU
waterqualityjames
waterqualityjames%>%
select(Parameter, MeasureValue, SampleDate)%>%
filter(Parameter=="TURB_NTU")%>%
separate(SampleDate, c("month", "day", "year"))%>%
mutate_if(is.character,as.numeric)%>%
arrange(year)%>%
group_by(year)%>%
mutate(mean(MeasureValue))%>%
rename("average" = "mean(MeasureValue)")%>%
select(year, average)%>%
rename(Year=year)-> waterjamesTURB
waterjamesTURB
#remove duplicate rows
waterjamesTURB%>%
distinct(Year, average, .keep_all = TRUE)->waterjamesTURB2
waterjamesTURB2
```
```{r}
#lets look at suspended solids instead since turbidity is missing a lot of data
TSS <- read.csv("TSSlowerjames.csv")
TSS%>%
select(Parameter, MeasureValue, SampleDate)%>%
filter(Parameter=="TSS")%>%
separate(SampleDate, c("month", "day", "year"))%>%
mutate_if(is.character,as.numeric)%>%
arrange(year)%>%
group_by(year)%>%
mutate(mean(MeasureValue))%>%
rename("average" = "mean(MeasureValue)")%>%
select(year, average)%>%
rename(Year=year)-> waterjamesTSS
waterjamesTSS
waterjamesTSS%>%
distinct(Year, average, .keep_all = TRUE)->waterjamesTSS2
waterjamesTSS2
```
```{r}
#plot data
waterjamesTSS2%>%
ggplot(aes(x=Year, y= average)) +
geom_point()+
expand_limits(x=2020)+
ggtitle("Total Suspended Solids in the James River by Year")+
ylab("Total Suspended Solids")
#plot the data to visualize the relationship between TTS and year
ggplot(data=waterjamesTSS2, aes(Year, average)) +
geom_point()+
labs(x="Year", y = "Total Suspended Solids")+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+
expand_limits(x=2020)
```
```{r}
#run a pearson's correlation test to see if the variables year and total suspended solids are related
cor.test(waterjamesTSS2$Year, waterjamesTSS2$average)
#make a linear regression model to find a relationship between year and total suspended solids
fit.4 <- lm(Year ~ average, data=waterjamesTSS2)
fit.4
summary(fit.4)
```
```{r}
#this is where the students'.RMD file becomes different - they will not have this code, they will have to do it on their own for homework
#load in the Rappahannock oyster data
oysterrapp1 <- read_csv("LowerRappOysters.csv")
#look at the data
oysterrapp1
#note that the data is in character form
summary(oysterrapp1)
```
```{r}
#separate out the columns you need - Year, Total
#convert year and total to numeric from character
#arrange by year
#get average Total for each year
oysterrapp1%>%
select(Year, Total)%>%
mutate_if(is.character,as.numeric)%>%
arrange(Year)%>%
group_by(Year)%>%
mutate(mean(Total))%>%
rename("average" = "mean(Total)")%>%
select(Year, average)-> oystersaverage2
oystersaverage2
#remove duplicate rows
oystersaverage2%>%
distinct(Year, average, .keep_all = TRUE)->oystersrappave3
oystersrappave3
```
```{r}
#plot total oysters through time
oystersrappave3%>%
ggplot(aes(x=Year, y=average))+
geom_point()+
expand_limits(x = 2020)+
ggtitle("Average Total Oysters in the Rappahannock by Year")+
ylab("Average Total Oysters")+
geom_line()
#plot total oysters in relation to year
ggplot(data=oystersrappave3, aes(Year, average)) +
geom_point()+
labs(x="Year", y = "Total Average Oysters")+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)
```
```{r}
#correlation test
cor.test(oystersrappave3$Year, oystersrappave3$average)
#linear regression
fit.5 <- lm(Year ~ average, data=oystersrappave3)
fit.5
summary(fit.5)
```
```{r}
#get water quality data for RAPP
waterqualityrapp2 <- read_csv("LowerRappahannockWaterQuality.csv")
waterqualityrapp2
```
```{r}
#first we are going to look at total nitrogen
#the Rappahannock oyster data starts in 2000, so we have to manipulate the water quality data to start in 2000 as well
waterqualityrapp2%>%
select(Parameter, MeasureValue, SampleDate)%>%
filter(Parameter=="TN")%>%
separate(SampleDate, c("month", "day", "year"))%>%
filter(year>=2000)%>%
mutate_if(is.character,as.numeric)%>%
arrange(year)%>%
group_by(year)%>%
mutate(mean(MeasureValue))%>%
rename("average" = "mean(MeasureValue)")%>%
select(year, average)%>%
rename(Year=year)-> waterrappTN
waterrappTN
#remove duplicate rows
waterrappTN%>%
distinct(Year, average, .keep_all = TRUE)->waterrappTN2
waterrappTN2
```
```{r}
#let's take a look at the data
hist(oystersaverage$average)
hist(waterrappTN$average)
#data does not look normally distributed, so we are going to log it to transform it
hist(log(oystersaverage$average, 10), main = NULL, xlab="Logged Average Total Oysters")
hist(log(waterrappTN$average, 10), main = NULL, xlab="Logged Average Total Nitrogen")
```
```{r}
#plot total nitrogen data through time with labels and a title
waterrappTN2%>%
ggplot(aes(x=Year, y= average)) +
geom_point()+
expand_limits(x=2020)+
ggtitle("Average Total Nitrogen in the Rappahannock by Year")+
ylab("Average Total Nitrogen")
#plot the relationship between TN and year
ggplot(data=waterrappTN2, aes(Year, average)) +
geom_point()+
labs(x="Year", y = "Total Nitrogen")+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)
#since there is no visible trend, the line is not increasing or decreasing, we will not proceed with the correlation test or the linear regression model because we cannot visually establish a relationship between the variables.
```
```{r}
#chlorophyll-A RAPP
#manipulate data
waterqualityrapp2%>%
select(Parameter, MeasureValue, SampleDate)%>%
filter(Parameter=="CHLA")%>%
separate(SampleDate, c("month", "day", "year"))%>%
filter(year>=2000)%>%
mutate_if(is.character,as.numeric)%>%
arrange(year)%>%
group_by(year)%>%
mutate(mean(MeasureValue))%>%
rename("average" = "mean(MeasureValue)")%>%
select(year, average)%>%
rename(Year=year)-> waterrappCHL
waterrappCHL
#remove duplicate rows
waterrappCHL%>%
distinct(Year, average, .keep_all = TRUE)->waterrappCHL2
waterrappCHL2
```
```{r}
#plot data through time
waterrappCHL2%>%
ggplot(aes(x=Year, y= average)) +
geom_point()+
expand_limits(x=2020)+
ggtitle("Total Chlorophyll-A in the Rappahannock by Year")+
ylab("Total Chlorophyll-A")
#plot the relationship between CHL and year
ggplot(data=waterrappCHL2, aes(Year, average)) +
geom_point()+
labs(x="Year", y = "Total Chlorophyll-A")+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)
```
```{r}
#correlation test
cor.test(waterrappCHL2$Year, waterrappCHL2$average)
#linear regression
fit.7 <- lm(Year ~ average, data=waterrappCHL2)
fit.7
summary(fit.7)
```
```{r}
#total phosphorus RAPP
waterqualityrapp2%>%
select(Parameter, MeasureValue, SampleDate)%>%
filter(Parameter=="TP")%>%
separate(SampleDate, c("month", "day", "year"))%>%
filter(year>=2000)%>%
mutate_if(is.character,as.numeric)%>%
arrange(year)%>%
group_by(year)%>%
mutate(mean(MeasureValue))%>%
rename("average" = "mean(MeasureValue)")%>%
select(year, average)%>%
rename(Year=year)-> waterrappTP
waterrappTP
#remove duplicate rows
waterrappTP%>%
distinct(Year, average, .keep_all = TRUE)->waterrappTP2
waterrappTP2
#look at the data distribution
hist(waterrappTP$average)
hist(log(waterrappTP$average, 10), main = NULL, xlab="Logged Average Total Phosphorus")
```
```{r}
#plot the data through time
waterrappTP2%>%
ggplot(aes(x=Year, y= average)) +
geom_point()+
expand_limits(x=2020)+
ggtitle("Average Total Phosphorus in the Rappahannock by Year")+
ylab("Average Total Phosphorus")
#plot the relationship between TP and year
ggplot(data=waterrappTP2, aes(Year, average)) +
geom_point()+
labs(x="Year", y = "Total Phosphorus")+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)
```
```{r}
#correlation test
cor.test(waterrappTP2$Year, waterrappTP2$average)
#linear regression
fit.8 <- lm(Year ~ average, data=waterrappTP2)
fit.8
summary(fit.8)
```
```{r}
#turbitidy RAPP
#TURB_NTU
waterqualityrapp2%>%
select(Parameter, MeasureValue, SampleDate)%>%
filter(Parameter=="TURB_NTU")%>%
separate(SampleDate, c("month", "day", "year"))%>%
filter(year>=2000)%>%
mutate_if(is.character,as.numeric)%>%
arrange(year)%>%
group_by(year)%>%
mutate(mean(MeasureValue))%>%
rename("average" = "mean(MeasureValue)")%>%
select(year, average)%>%
rename(Year=year)-> waterrappTURB
waterrappTURB
#remove duplicate rows
waterrappTURB%>%
distinct(Year, average, .keep_all = TRUE)->waterrappTURB2
waterrappTURB2
```
```{r}
#plot turbidity through time to visualize with title and labels
waterrappTURB2%>%
ggplot(aes(x=Year, y= average)) +
geom_point()+
ggtitle("Average Turbidity in the Rappahannock by Year")+
ylab("Average Turbidity")
#plot the relationship between Turbididy and year
ggplot(data=waterrappTURB2, aes(Year, average)) +
geom_point()+
labs(x="Year", y = "Total Turbidity")+
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)
#since there is no visible trend, the line is not increasing or decreasing, we will not proceed with the correlation test or the linear regression model because we cannot visually establish a relationship between the variables.
```