CRDSutils: Picarro L21X0-i Utilities

#####################################################################
###                      Peak Picking Script                      ###
###      The code in this file can be used to analyze a           ###
###      folder with dat files generated by the Picarro.          ###
###      First it reads in all of the files in a folder           ###
###      (including subfolders). Next, it calculates the          ###
###      slope for h2o vs. time. Then it picks all of the         ###
###      peaks. It saves a plot showing the picked peaks.         ###
###      Finally, it averages the isotope values for              ###
###      each peak. This script is meant to be run in             ###
###      sections.                                                ###
#####################################################################

#####################################################################
###   Section 1 - Organize your data to be analyzed
#####################################################################

setwd("") 
## update this to your working directory where you would like to 
## the plots & csv generated by this script to be saved
## e.g. "C:/Users/u0989124/Dropbox/SPATIAL lab"

folder <- ""
## update this with the filepath to folder that contains the dat 
## files to be analyzed eg. 
## "C:/Users/u0989124/Dropbox/SPATIAL lab/hids2052/150209"

peaks.read <- function(filepath){
  ## peaks.read is a function that will read files from all 
  ## sub-folders of the directory named in filepath and combine them 
  ## into one dataframe
  
  	files <-  list.files(path=filepath, recursive = T,
                         full.names = T) 
	## lists the file names in the filepath directory
  	tables <- lapply(files, read.table,header = T, 
                     stringsAsFactors = F) 
	## reads in the files
  	df <- do.call("rbind",tables) 
  ## combines all the tables into one dataframe
  	df2 <- df[,c(1:2,17:19)] 
  ## subsets the dataframe to include only necessary columns
  	df3 <- df2[complete.cases(df2),] 
  ## subsets the dataframe to include only complete cases, 
	## eliminating rows at the end that aren't complete
  	return(df3)
	} 

data <- peaks.read(folder) 
	## using the peaks.read function, this reads in all of the 
	## data and puts it into a single dataframe while making  
	## some modifications to the dataframe to make it easier  
	## to manipulate. This will take awhile since there is a 
  ## lot of data to be processed.

data$seconds <- as.numeric(row.names(data)) 
	## creates a column that gives the second that each row was 
	## recorded from 1 to the end of the dataframe

plot(data$seconds, data$H2O) 
	## this plot should help you visualize the data so you 
	## know how to modify or subset it. you will want to subset 
	## the data to include only the section with the peaks 
	## you want to analyze.

data2 <- data[data$seconds > (head(data$seconds[data$H2O > 17000],1)-30) & 
                data$seconds < (tail(data$seconds[data$H2O > 17000],1)+30),] 
## this may be a useful way to subset your data because it starts 
## 30 seconds before the first peak and ends 30 seconds after 
## the last peak. It also creates a copy of data so that if you need 
## to go back to the original data you don't have to read it in again 

plot(data2$seconds, data2$H2O) 
## plots seconds vs water concentration, review this along
## with the next two plots to determine that the data have
## been subsetted appropriately

plot(data2$seconds[1:1000], data2$H2O[1:1000])
## plots seconds vs water concentration for the first 1000 seconds

plot(tail(data2$seconds,1000),  tail(data2$H2O,1000))
## plots seconds vs water concentration for the last 1000 seconds 

data2$seconds = seq(1:nrow(data2)) 
## resets the seconds column to start at 1

#####################################################################
###   Section 2 - Calculate slope & pick peaks
#####################################################################

h2o_slope <- function(data,window){
  # h2o_slope will calculate the slope of h2o vs. 
  # experiment second in a moving window  
  # data is a dataframe such as the one created
  # by the peaks.read function
  # window is a number that represents the
  # number of seconds the slope will be calculated
  # for
  
  H2O_Slope_Window <- numeric()
  # creates an empty vector that will be populated
  # by the following for loop
  
  for (i in 1:nrow(data)){    
    if(i<=window/2){                                                                   
      H2O_Slope_Window[i] = 
        as.numeric(coef(lm(data$H2O[1:(i+window/2)]~
		data$seconds[1:(i+window/2)]))[2])
    # for seconds at the beginning of the dataset that
    # are less than half the window the slope is calculated
    # for the first set of seconds that represent half the 
    # slope window
    
    }else{
      if(i >= (nrow(data)-window/2)){
        H2O_Slope_Window[i] = 
          as.numeric(coef(lm(data$H2O[(i-window/2):nrow(data)]~
		data$seconds[(i-window/2):nrow(data)]))[2])
    # for seconds at the end of the dataset 
    # the slope is calculated for the last set of seconds
    # that represent half the slope window
    
      }else{
        H2O_Slope_Window[i] = 
          as.numeric(coef(lm(data$H2O[(i-window/2):(i+window/2)]~
		data$seconds[(i-window/2):(i+window/2)]))[2])   
    # for all other seconds, the slope is calculated for
    # the range of seconds +/- half the window around that second
      }
    }
  }
  return(H2O_Slope_Window)
} 

data2$h2o_slope <- h2o_slope(data = data2, window = 12) 
	# using the h2o_slope function, this creates a column in the 
	# data2 dataframe with the slope of h2O v. seconds for a moving 
	# window of the given length. This will take a very long time 
  # (about 10 minutes to process 287 peaks)

cutoff <- 1000
# set the slope cutoff you want to use to determine your peaks
# the numbers set here for cutoff, setback, peak_length,
# and minimum were optimized to be as similar as possible to 
# the data generated by the Picarro machine when it picks the peaks

setback <- 5
# set the number of values you want to remove from the end of your peak

peak_length <- 163
# set the number of seconds you want to average values for on your peak

minimum <- 19000 
# set the minimum H2O concentration you want to use to determine your peaks

peaks <- data2[abs(data2$h2o_slope) < cutoff & data2$H2O > minimum ,]
# subsets the dataframe by your cutoff and minimum values

end <- c(which(diff(peaks$seconds) >30), nrow(peaks)) 
# determines the second at which each peak ends

last <- peaks$seconds[end]-setback 
# creates a vector that will be the ending cutoff second for each peak
# including the setback

first <- last-peak_length
# creates a vector that will be the beginning cutoff for each peak

peaknumber <- numeric(nrow(data2)) 
# creates an empty vector that will be populated
# by the following for loop

for (i in 1:length(last)){ 
    peaknumber[(first[i]):(last[(i)])] = i}
	# populates peaknumber vector so that it can be appended to 
  # data2 to indicate which rows belong to
	# each peak
data2$peaknumber <- peaknumber
	#appends peaknumber to data2 as column peaknumber

#####################################################################
###   Section 3 - Plot & review data
#####################################################################

## using the plots generated below, visualize the data to ensure you 
## have picked appropriate areas to analyze. You may need to change 
## the xlim (minimum x value to display on plot) to accomodate your 
## data. These plots show only the first 6000 seconds of analysis.  
## This section of code will generate a jpeg of the plotted output 
## in your working directory called peaks.jpg

jpeg("peaks.jpg", height = 2080, width = 2480, units = "px",
     quality = 100)
## create jpeg file to write the plots to

par(mfrow = c(3, 1))
## set plotting area to display 3 plots

plot(data2$seconds, data2$H2O, ylim = c(0,30000), xlim = c(0, 6000), 
xlab = "seconds", ylab = expression('H'[2]*'O'))
## plots seconds vs. h2o concentration for the first 6000 seconds &
## up to 30000 seconds
par(new = T)
## sets graph to plot a second dataset on the same plot
plot(data2$seconds[data2$peaknumber>0], data2$H2O[data2$peaknumber>0], 
     ylim = c(0,30000), 
xlim = c(0, 6000), col = "red", axes = F, xlab = "", ylab = ""
, pch = 16)
## plots the seconds that have been identified as the 
## peaks in the color red

plot(data2$seconds[1:6000], data2$Delta_18_16[1:6000], 
     ylim = c(-20,20), xlim = c(0,6000), xlab = "seconds",
     ylab = "d18O") 
## plots seconds vs. d18O data for the first 6000 seconds & a 
## range of d18O values from -20 to 20 per mil
par(new = T)
## sets graph to plot a second dataset on the same plot
plot(data2$seconds[data2$peaknumber>0], 
     data2$Delta_18_16[data2$peaknumber>0], ylim = c(-20,20), 
     xlim = c(0,6000),col = "red", axes = F, xlab = "", ylab = "")
## plots the seconds that have been identified as the 
## peaks in the color red

plot(data2$seconds[1:6000], data2$Delta_D_H[1:6000],
     ylim = c(-130, 30), xlim = c(0, 6000), 
     xlab = "", ylab = "d2H")
## plots seconds vs. d2H data for the first 6000 seconds & a 
## range of d18O values from -130 to 30 per mil
par(new = T)
## sets graph to plot a second dataset on the same plot
plot(data2$seconds[data2$peaknumber>0], 
     data2$Delta_D_H[data2$peaknumber>0], 
     ylim = c(-130, 30), xlim = c(0, 6000), 
     col = "red", axes = F, xlab = "", ylab = "")
## plots the seconds that have been identified as the 
## peaks in the color red

dev.off()
## shuts down the plotting device

#####################################################################
###   Section 4 - average & write data to csv
#####################################################################

final <- aggregate(data2[data2$peaknumber>0,4:5],
                   list(Peak = data2$peaknumber[data2$peaknumber>0]),
                   mean)
	#averages the values for each peak

write.csv(final,"peak_output.csv")
	#writes the final dataframe to a csv in your working directory