R/prepare.data.R
In MAR1: Multivariate Autoregressive Modeling for Analysis of Community Time-Series Data

Documented in prepare.data

##  Prepares plankton abundance time-series data for MAR analysis:
	#  1) aggregate by month
	#  2) fill in gaps less than or equal to 'fill.gap' months long via linear interpolation
	#  3) mark blocks of consecutive time-steps
	#  4) change zeros to random number less than 1/2 lowest non-zero value 
		# OR add 1 to all values
	#  5) log10 transform
	#  6) zscore values where z = (value-taxon.mean)/taxon.stdev
		# OR z = value-taxon.increment.mean/taxon.increment.stdev (de-seasons data)



prepare.data <- function(data, increment=c("month","year","week","day"),
					fill.gap=0, replace.0s=c(FALSE,"rand.half","add.ones"), 
					log=FALSE, z.method=c(FALSE,"standard","deseason"),order=NULL) {

##  Argument values:
	#  x = dataframe to be transformed; first column dates, following columns plankton taxa
	#  increment = time-step increment
	#  fill.gap = maximum length of time-step gap to be filled by linear interpolation
	#  replace.0s = "rand.half" OR "add.ones" (in quotes) OR FALSE
			# "rand.half" replaces 0 values with a random value less than 1/2 taxon minimum
			# "add.ones" adds 1 to all values
			# FALSE leaves zeros in data
	#  log = should data be log-transformed?
	#  z.method = "standard" OR "deseason" (in quotes) OR FALSE
			# "standard" subtracts the taxon mean and divides by the taxon standard deviation
			# "deseason" does the same, but the means and standard deviations are month-specific 
					# to remove seasonality
			# FALSE does not apply a z-score to the data
	#  order = date order; "dmy" or "ymd" or "mdy" (must be in quotes); function will attempt
		# to distinguish date order on its own if this argument is not provided

increment<-increment[1]
replace.0s<-replace.0s[1]
z.method<-z.method[1]

# remove any columns that contain only zeros
data1<-data
if(any(apply(data[,-1],2,mean,na.rm=T)==0)){
data1<-data[,-(which(apply(data[,-1],2,mean,na.rm=T)==0)+1)]
cat(paste(length(which(apply(data[,-1],2,mean,na.rm=T)==0)),"columns containing all zeros omitted\n"))
}

#====================================================================================
# 1) FORMAT DATES AND AGGREGATE TAXA BY INCREMENT:
#====================================================================================

dates<-as.character(data1[,1])
d<-strsplit(dates[1],"")[[1]]
sep<-unique(d[is.na(suppressWarnings(as.numeric(d)))])
if(length(sep)!=1) stop("Unrecognized date format. Make sure dates are in first column 
  and use consistent non-numeric seperator between year, month, and day.")
d<-data.frame(apply(t(data.frame(strsplit(dates,sep))),2,as.numeric))

if(is.null(order)){
if(length(which(apply(d,2,max)<13))>1) stop("Cannot distinguish between days and months in dates.
	Provide 'order' argument in function call.")
names(d)[which(apply(d,2,max)>12)]<-"d"
names(d)[which(apply(d,2,max)<13)]<-"m"
names(d)[which(apply(d,2,max)>100)]<-"y"
} else {names(d)<-strsplit(order,"")[[1]]}

data2<-data.frame(date=as.Date(paste(d$y,d$m,d$d,sep="-")),data1[,-1])
data2<-aggregate(data2[,-1],by=list(data2[,1]),mean,na.rm=T)
names(data2)[1]<-"date"
date<-seq.Date(min(data2[,1]),max(data2[,1]),1)
data2<-merge(data.frame(date=date),data2,all=T)
taxa <- data2[,-1]

if(increment=="month") t<-"%Y-%m"
if(increment=="year") t<-"%Y"
if(increment=="week") t<-"%Y-%U"
if(increment=="day") t<-"%Y-%j"

tstep=as.character(date,t)

byinc <- aggregate(data2,by=list(tstep=tstep),mean,na.rm=T)
byinc.taxa<-byinc[,-c(1:2)]

if(increment=="day") incremently<-"daily" else incremently<-paste(increment,"ly",sep="")
cat(paste("data successfully aggregated into",incremently,"time steps\n"))

#====================================================================================
# 2) FIND GAPS AND FILL IN THOSE <= fill.gap:
#====================================================================================

missing<-is.na(apply(byinc.taxa,1,mean))  # where are values missing? 1=yes 0=no

byinc2<-byinc

if(any(missing)&fill.gap>0){

# gets lengths of 1 sequences
lengths.miss <- rle(missing)$lengths[which(rle(missing)$values==T)]
# get indices of all missing values
which.miss <- which(missing)
# get starting row of each missing sequence
start.miss <- which.miss[cumsum(c(1,lengths.miss))[-(length(lengths.miss)+1)]]

# focus on missing sequences less than 'fill.gap' value
start.miss<-start.miss[lengths.miss<=fill.gap]
lengths.miss<-lengths.miss[lengths.miss<=fill.gap]

# replace the 'm' missing value sequences in byinc2 that are <= 'fill.gap' 
	# with linearly interpolated values for each 'j' taxa

if(length(lengths.miss)>0){
for(m in 1:length(lengths.miss)){
for(j in 3:ncol(byinc)){

byinc2[start.miss[m]:(start.miss[m]+lengths.miss[m]-1),j]<-
approx(	x=c(1,lengths.miss[m]+2),	# x=gap length plus 2 endpoints
		# y=abundance values on either side of gap
		y=byinc[c(start.miss[m]-1,start.miss[m]+lengths.miss[m]),j],
		# xout=get interpolation for the missing values
		xout=2:(lengths.miss[m]+1))$y  

}}}

cat(paste(length(lengths.miss),"time gaps <=",fill.gap,"interpolated\n"))

}

#====================================================================================
# 3) MARK BLOCKS OF CONTINUOUS TIME:
#====================================================================================

contin <- 1
for(i in 1:nrow(byinc2)){
if(any(is.na(byinc2[i,])))
contin<-c(contin,contin[i]+1) 
else contin<-c(contin,contin[i])
}

byinc2<-cbind(contin=contin[-1],byinc2) # add contin column to data
if(!any(duplicated(byinc2$contin))){
	cat(paste("NO CONTINUOUS TIME-STEPS IN AGGREGATED SERIES...  Returning",incremently,"data\n\n"))
	return(byinc2[,-2])
	}
byinc3<-na.omit(byinc2)  # strip NA rows from data
cat(paste("Time-series contains",length(which(duplicated(byinc3$contin))),"continuous time-steps\n"))

# remove any columns that contain only zeros
if(any(apply(byinc3[,-c(1:3)],2,mean,na.rm=T)==0))
byinc3<-byinc3[,-(which(apply(byinc3[,-c(1:3)],2,mean,na.rm=T)==0)+3)]

#====================================================================================
# 4) REPLACE ZEROS IN DATA:
#====================================================================================

if(log&replace.0s==F&any(byinc3==0)) 
stop("Zeros must be replaced before data are log-transformed.")

if(replace.0s!=F&any(byinc3<0))
stop("Negative values in data indicate values may already be log-transformed.  Cannot replace zeros.")

no.zeros <- byinc3

#=======================================
	## a)  TO RANDOM NUMBERS LESS THAN 1/2 THE LOWEST VALUE FOR EACH TAXON
#=======================================

if (replace.0s=="rand.half"){

for (j in 4:ncol(no.zeros)){
	nonzeros <- no.zeros[no.zeros[,j]>0,j]
	for (i in 1:nrow(no.zeros)){
		if (no.zeros[i,j]==0) no.zeros[i,j] <- runif(1,0,min(nonzeros)/2) }}
}

#=======================================
	## b)  BY ADDING 1 TO ALL VALUES
#=======================================

if (replace.0s=="add.ones"){
no.zeros <- data.frame(no.zeros[,1:3],no.zeros[,4:ncol(no.zeros)]+1)
}

if(replace.0s!=FALSE) cat(paste(sum(byinc3==0),"zeros replaced\n")) #else 
#cat(paste(sum(byinc3==0),"zeros left in data\n"))

#====================================================================================
# 5) LOG10 TRANSFORM VALUES:
#====================================================================================

if(log&any(no.zeros[,4:ncol(no.zeros)]<0)){
	stop("Data contain negative values. Cannot log-transform.")}

if(log){
logged <- data.frame(no.zeros[,1:3],log10(no.zeros[,4:ncol(no.zeros)]))
cat("data successfully log-transformed\n")} else logged<-no.zeros

#====================================================================================
# 6) Z-SCORE VALUES:
#====================================================================================

#=======================================
	## a)  AND REMOVE SEASONALITY
#=======================================

if(z.method==FALSE) zscores<-logged

if(z.method=="deseason"&increment=="year"){
	warning("Cannot deseason yearly values.  Using standard z-score method.",
			call.=F,immediate.=T)
	z.method<-"standard"}

if (z.method=="deseason"){

# remove year part of tstep to get general increment values
overall.inc<-substr(logged$tstep,6,nchar(logged$tstep))
deseason.zeros<-length(unique(overall.inc))/length(overall.inc)
if(deseason.zeros>.25) warning(paste(
	"Using the deseasoning z-score method has generated a large 
	      number of zeros due to many unique sampling",increment,"values."),
	call.=F,immediate.=F)

# create a data frame of overall increment means 'oim'
oim <- data.frame(aggregate(logged[,-c(1:3)], by=list(increment=overall.inc), mean))

# create a data frame of overall increment standard deviations 'oisd'
oisd <- data.frame(aggregate(logged[,-c(1:3)], by=list(increment=overall.inc), sd))  

oisd[is.na(oisd)]<-1  # sds will be NA if only one occurence of overall value
oisd[oisd==0]<-1  # sds will be 0 if multiple occurences of overall value are equal

zscores <- NULL

for(i in 1:nrow(logged)) {
  inc <- overall.inc[i]
  (logged[i,-c(1:3)]-oim[which(oim$increment==inc),-1])/oisd[which(oisd$increment==inc),-1]->zscore
  zscores<-rbind(zscores,zscore)}

zscores <- data.frame(logged[,1:3],zscores)

cat("data successfully z-scored with deseasoning\n")
}

#=======================================
	## b) WITHOUT REMOVING SEASONALITY
#=======================================

if (z.method=="standard"){

om <- apply(data.frame(logged[,-c(1:3)]),2,mean)  # get overall means for each taxon
osd <- apply(data.frame(logged[,-c(1:3)]),2,sd)  # get overall standard deviations for each taxon

zscores <- logged[,1:3]

for(j in 4:ncol(logged)) {
  zscores<-data.frame(zscores,(logged[,j]-om[j-3])/osd[j-3]) }

names(zscores)<-names(logged)

cat("data successfully z-scored\n")
}

zscores<-zscores[,-2]

zscores
}

Any scripts or data that you put into this service are public.

MAR1 documentation built on May 29, 2017, 4:18 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

MAR1
Multivariate Autoregressive Modeling for Analysis of Community Time-Series Data

R/prepare.data.R
In MAR1: Multivariate Autoregressive Modeling for Analysis of Community Time-Series Data

Defines functions prepare.data

Documented in prepare.data

Try the MAR1 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MAR1 Multivariate Autoregressive Modeling for Analysis of Community Time-Series Data

R/prepare.data.R In MAR1: Multivariate Autoregressive Modeling for Analysis of Community Time-Series Data

Defines functions prepare.data

Documented in prepare.data

Try the MAR1 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MAR1
Multivariate Autoregressive Modeling for Analysis of Community Time-Series Data

R/prepare.data.R
In MAR1: Multivariate Autoregressive Modeling for Analysis of Community Time-Series Data