#####################################################################
### Peak Picking Script ###
### The code in this file can be used to analyze a ###
### folder with dat files generated by the Picarro. ###
### First it reads in all of the files in a folder ###
### (including subfolders). Next, it calculates the ###
### slope for h2o vs. time. Then it picks all of the ###
### peaks. It saves a plot showing the picked peaks. ###
### Finally, it averages the isotope values for ###
### each peak. This script is meant to be run in ###
### sections. ###
#####################################################################
#####################################################################
### Section 1 - Organize your data to be analyzed
#####################################################################
setwd("")
## update this to your working directory where you would like to
## the plots & csv generated by this script to be saved
## e.g. "C:/Users/u0989124/Dropbox/SPATIAL lab"
folder <- ""
## update this with the filepath to folder that contains the dat
## files to be analyzed eg.
## "C:/Users/u0989124/Dropbox/SPATIAL lab/hids2052/150209"
peaks.read <- function(filepath){
## peaks.read is a function that will read files from all
## sub-folders of the directory named in filepath and combine them
## into one dataframe
files <- list.files(path=filepath, recursive = T,
full.names = T)
## lists the file names in the filepath directory
tables <- lapply(files, read.table,header = T,
stringsAsFactors = F)
## reads in the files
df <- do.call("rbind",tables)
## combines all the tables into one dataframe
df2 <- df[,c(1:2,17:19)]
## subsets the dataframe to include only necessary columns
df3 <- df2[complete.cases(df2),]
## subsets the dataframe to include only complete cases,
## eliminating rows at the end that aren't complete
return(df3)
}
data <- peaks.read(folder)
## using the peaks.read function, this reads in all of the
## data and puts it into a single dataframe while making
## some modifications to the dataframe to make it easier
## to manipulate. This will take awhile since there is a
## lot of data to be processed.
data$seconds <- as.numeric(row.names(data))
## creates a column that gives the second that each row was
## recorded from 1 to the end of the dataframe
plot(data$seconds, data$H2O)
## this plot should help you visualize the data so you
## know how to modify or subset it. you will want to subset
## the data to include only the section with the peaks
## you want to analyze.
data2 <- data[data$seconds > (head(data$seconds[data$H2O > 17000],1)-30) &
data$seconds < (tail(data$seconds[data$H2O > 17000],1)+30),]
## this may be a useful way to subset your data because it starts
## 30 seconds before the first peak and ends 30 seconds after
## the last peak. It also creates a copy of data so that if you need
## to go back to the original data you don't have to read it in again
plot(data2$seconds, data2$H2O)
## plots seconds vs water concentration, review this along
## with the next two plots to determine that the data have
## been subsetted appropriately
plot(data2$seconds[1:1000], data2$H2O[1:1000])
## plots seconds vs water concentration for the first 1000 seconds
plot(tail(data2$seconds,1000), tail(data2$H2O,1000))
## plots seconds vs water concentration for the last 1000 seconds
data2$seconds = seq(1:nrow(data2))
## resets the seconds column to start at 1
#####################################################################
### Section 2 - Calculate slope & pick peaks
#####################################################################
h2o_slope <- function(data,window){
# h2o_slope will calculate the slope of h2o vs.
# experiment second in a moving window
# data is a dataframe such as the one created
# by the peaks.read function
# window is a number that represents the
# number of seconds the slope will be calculated
# for
H2O_Slope_Window <- numeric()
# creates an empty vector that will be populated
# by the following for loop
for (i in 1:nrow(data)){
if(i<=window/2){
H2O_Slope_Window[i] =
as.numeric(coef(lm(data$H2O[1:(i+window/2)]~
data$seconds[1:(i+window/2)]))[2])
# for seconds at the beginning of the dataset that
# are less than half the window the slope is calculated
# for the first set of seconds that represent half the
# slope window
}else{
if(i >= (nrow(data)-window/2)){
H2O_Slope_Window[i] =
as.numeric(coef(lm(data$H2O[(i-window/2):nrow(data)]~
data$seconds[(i-window/2):nrow(data)]))[2])
# for seconds at the end of the dataset
# the slope is calculated for the last set of seconds
# that represent half the slope window
}else{
H2O_Slope_Window[i] =
as.numeric(coef(lm(data$H2O[(i-window/2):(i+window/2)]~
data$seconds[(i-window/2):(i+window/2)]))[2])
# for all other seconds, the slope is calculated for
# the range of seconds +/- half the window around that second
}
}
}
return(H2O_Slope_Window)
}
data2$h2o_slope <- h2o_slope(data = data2, window = 12)
# using the h2o_slope function, this creates a column in the
# data2 dataframe with the slope of h2O v. seconds for a moving
# window of the given length. This will take a very long time
# (about 10 minutes to process 287 peaks)
cutoff <- 1000
# set the slope cutoff you want to use to determine your peaks
# the numbers set here for cutoff, setback, peak_length,
# and minimum were optimized to be as similar as possible to
# the data generated by the Picarro machine when it picks the peaks
setback <- 5
# set the number of values you want to remove from the end of your peak
peak_length <- 163
# set the number of seconds you want to average values for on your peak
minimum <- 19000
# set the minimum H2O concentration you want to use to determine your peaks
peaks <- data2[abs(data2$h2o_slope) < cutoff & data2$H2O > minimum ,]
# subsets the dataframe by your cutoff and minimum values
end <- c(which(diff(peaks$seconds) >30), nrow(peaks))
# determines the second at which each peak ends
last <- peaks$seconds[end]-setback
# creates a vector that will be the ending cutoff second for each peak
# including the setback
first <- last-peak_length
# creates a vector that will be the beginning cutoff for each peak
peaknumber <- numeric(nrow(data2))
# creates an empty vector that will be populated
# by the following for loop
for (i in 1:length(last)){
peaknumber[(first[i]):(last[(i)])] = i}
# populates peaknumber vector so that it can be appended to
# data2 to indicate which rows belong to
# each peak
data2$peaknumber <- peaknumber
#appends peaknumber to data2 as column peaknumber
#####################################################################
### Section 3 - Plot & review data
#####################################################################
## using the plots generated below, visualize the data to ensure you
## have picked appropriate areas to analyze. You may need to change
## the xlim (minimum x value to display on plot) to accomodate your
## data. These plots show only the first 6000 seconds of analysis.
## This section of code will generate a jpeg of the plotted output
## in your working directory called peaks.jpg
jpeg("peaks.jpg", height = 2080, width = 2480, units = "px",
quality = 100)
## create jpeg file to write the plots to
par(mfrow = c(3, 1))
## set plotting area to display 3 plots
plot(data2$seconds, data2$H2O, ylim = c(0,30000), xlim = c(0, 6000),
xlab = "seconds", ylab = expression('H'[2]*'O'))
## plots seconds vs. h2o concentration for the first 6000 seconds &
## up to 30000 seconds
par(new = T)
## sets graph to plot a second dataset on the same plot
plot(data2$seconds[data2$peaknumber>0], data2$H2O[data2$peaknumber>0],
ylim = c(0,30000),
xlim = c(0, 6000), col = "red", axes = F, xlab = "", ylab = ""
, pch = 16)
## plots the seconds that have been identified as the
## peaks in the color red
plot(data2$seconds[1:6000], data2$Delta_18_16[1:6000],
ylim = c(-20,20), xlim = c(0,6000), xlab = "seconds",
ylab = "d18O")
## plots seconds vs. d18O data for the first 6000 seconds & a
## range of d18O values from -20 to 20 per mil
par(new = T)
## sets graph to plot a second dataset on the same plot
plot(data2$seconds[data2$peaknumber>0],
data2$Delta_18_16[data2$peaknumber>0], ylim = c(-20,20),
xlim = c(0,6000),col = "red", axes = F, xlab = "", ylab = "")
## plots the seconds that have been identified as the
## peaks in the color red
plot(data2$seconds[1:6000], data2$Delta_D_H[1:6000],
ylim = c(-130, 30), xlim = c(0, 6000),
xlab = "", ylab = "d2H")
## plots seconds vs. d2H data for the first 6000 seconds & a
## range of d18O values from -130 to 30 per mil
par(new = T)
## sets graph to plot a second dataset on the same plot
plot(data2$seconds[data2$peaknumber>0],
data2$Delta_D_H[data2$peaknumber>0],
ylim = c(-130, 30), xlim = c(0, 6000),
col = "red", axes = F, xlab = "", ylab = "")
## plots the seconds that have been identified as the
## peaks in the color red
dev.off()
## shuts down the plotting device
#####################################################################
### Section 4 - average & write data to csv
#####################################################################
final <- aggregate(data2[data2$peaknumber>0,4:5],
list(Peak = data2$peaknumber[data2$peaknumber>0]),
mean)
#averages the values for each peak
write.csv(final,"peak_output.csv")
#writes the final dataframe to a csv in your working directory
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.