smosfiles: keeps files

library(dplyr)
library(zoo)
library(reshape2)
library(devtools)
library(lubridate)

### New Data ###

folder <- "../smosfiles/data-raw/fullyear/"
files = list.files(path = folder,
                   pattern = "*.csv")
filenames <- gsub("\\.csv.*","",files)

for (i in 1:length(files)){
  assign(filenames[i],
         read.csv(paste(folder, files[i], sep=''),header=TRUE)
  )}

#Put All 30 Data Frames into a List
list_df = lapply(filenames, get)

#Make Generic Matrix to Fill in Missing Days
day  <- seq(1,365)
year <- rep(2010,365)
c <- data.frame(day,year)
yplus <- c(2011, 2012, 2013, 2014, 2015, 2016)
for (j in 1:6){
  year <- rep(yplus[j],365)
  c <- rbind(c, data.frame(day,year))
}

pixfully <- vector("list", 30)

for (i in 1:30){
  pix <- list_df[[i]]
  pix <- pix %>% filter(year %in% c(2010, 2011, 2012, 2013, 2014, 2015, 2016))
  pix$day <- as.integer(pix$doy)
  pix$day[pix$day<1] <- 1
  pix$tau[pix$tau<=0] <- NA
  pix$tau[pix$tau=="NaN"] = NA
  pixsub <- pix %>% group_by(year,day) %>% select(year,tau,day)
  pixsub <- pixsub %>% group_by(year,day) %>% summarise(y = mean(tau,na.rm=TRUE))
  pixsub$y[pixsub$y=="NaN"] = NA
  pixfin <- left_join(c,pixsub,by=c("year"="year","day"="day"))
  loc <- rep(pix$dgg[1],2555)
  lat <- rep(pix$lat[1],2555)
  lon <- rep(pix$lon[1],2555)
  pixfin$loc <- loc
  pixfin$lat <- lat
  pixfin$lon <- lon
  pixfully[[i]] <- pixfin
}


#Turn Into Data Objects
devtools::use_data(pixfully, overwrite = TRUE, compress="xz")



### Old Data Cleaning Process ###

folder <- "../smosfiles/data-raw/raw/"
files = list.files(path = folder,
                   pattern = "*.csv")
filenames <- gsub("\\.csv.*","",files)

for (i in 1:length(files)){
  assign(filenames[i],
         read.csv(paste(folder, files[i], sep=''),header=TRUE)
  )}

list_df = lapply(filenames, get)


#Get a List of Pixel Names and Grab All 180 files
seqpix <- seq(1,180,by=6)
namespix = filenames[seqpix]
spl = strsplit(namespix, "_",fixed=TRUE)
namesfin = unlist(lapply(spl,FUN=function(x){paste(x[1],x[2],sep="_")}))
list_df = lapply(filenames, get)

#L1 Cleaning: For Double Counted Times, keep the L2 Version of Tau

ls <- NULL #Count the number of Duplicates We Are Removing

nodupsdat <- list()

for(i in 1:180){
  tmp <- list_df[[i]] %>% add_rownames()
  dups <- tmp %>% group_by(doy) %>% filter(n()>=2) %>% filter(L1Cv==1) %>% filter(tau>0)
  index <- as.integer(dups$rowname)
  ls <- append(ls,index)
  if (length(index)>0)
    {nodupsdat[[i]] <- tmp[-index,]}
  else{
    nodupsdat[[i]] <- tmp
  }

}


#Group Files by Location.  Return a List of 30 Pixels
locpix <- list()
for (i in 0:29){
  lower = 1+i*6
  upper = 6+i*6
  locpix[[i+1]] <- lapply(nodupsdat[lower:upper], as.data.frame)
}



#Make DataFrame with Locations of Pixels

maps2=lapply(locpix,"[[", 1,drop=FALSE)
maps3=lapply(maps2, "[", 1,c(3:5))
anna = do.call(rbind.data.frame, maps3)


#Add Column of Integer Dates
intdat <- vector("list", length(locpix))
for (j in 1:30){
  intdat[[j]] <- lapply(locpix[[j]],function(x) transform(x, doyint = as.integer(x[,2])))
}

#Return Back to 180 Files
unlistdat <- unlist(intdat, recursive = FALSE)

#Drop Obs with Tau<=0
Tpos <- vector("list", length(unlistdat))
for(i in 1:180){
  Tpos[[i]] <- unlistdat[[i]][unlistdat[[i]]$tau>0,]
}

#Make 2 Lists of 30 pixels: 1 with Filter for Corrupted Signal Filter, and One Without

#Remove Obs if either IRRFI or Chi_2P > .05.  Very strict threshold

noisefree <- list()

rm <- NULL

for (i in 1:180){
  noisefree[[i]] <- subset(Tpos[[i]],IRRFI <= .05 & Chi_2P <= .05)
  diff <- nrow(Tpos[[i]]) - nrow(sub)
  rm <- append(rm, diff)
}

#Add Indicator variable for Noise Obs
noiseind <- list()
for (i in 1:180){
  noiseind[[i]] <- Tpos[[i]] %>% mutate(corrupt = ifelse(IRRFI>=.05 | Chi_2P >=.05,1,0))
  }


#Average Tau Over duplicate Days
Nodup <- lapply(Tpos, function(x) aggregate(tau~doyint,data=x,FUN = mean))


#Aggregate Over Indicator for Noise
FlagIndicator <- lapply(noiseind, function(x) aggregate(corrupt~doyint,data=x,FUN = mean))

#If Multiple Obs Per Interger Day (corrupt=.5), round down to zero
FlagIndicator<-lapply(FlagIndicator,function(x) as.data.frame(cbind(x[,1],ifelse(x[,2]<1,0,1))))

#Turn all 180 files to Zoo Time Series Objects
pix.zoo <- lapply(Nodup, read.zoo, index=1)

pix.zooInd <- lapply(FlagIndicator, read.zoo, index=1)

#print(pix.zoo[[1]],style = "vertical")

#Use Zoo to Fill In Gaps so All Years Have Same Index
wide <- list()
for (i in 0:29){
  lower <- 1+i*6
  upper <- 6+i*6
  l1 <- pix.zoo[lower:upper]
  unionl1 <- Reduce(merge,l1)
  g <- zoo(,seq(start(unionl1),end(unionl1)))
  z <- merge(unionl1,g)
  wide[[i+1]] <- z
}

wideNF <- list()
for (i in 0:29){
  lower <- 1+i*6
  upper <- 6+i*6
  l1 <- pix.zooInd[lower:upper]
  unionl1 <- Reduce(merge,l1)
  g <- zoo(,seq(start(unionl1),end(unionl1)))
  z <- merge(unionl1,g)
  wideNF[[i+1]] <- z
}

#Note to Self: Let's Try and Use lapply more rather than loops.

wideNF <- lapply(wideNF, function (x) {
  names(x) <- c("2010","2011", "2012","2013","2014","2015")
  x
  })

wide <- lapply(wide, function (x) {
  names(x) <- c("2010","2011", "2012","2013","2014","2015")
  x
})

#Add Index Column.  Note: NF= Noise Filter

wideNF <- lapply(wideNF, function (x) {
  x <- add_rownames(as.data.frame(x),"VALUE")
  x
})

wide <- lapply(wide, function (x) {
  x <- add_rownames(as.data.frame(x),"VALUE")
  x
})

#Truncated Year's Worth of Data

notfullyear <- wide

#Split 275 & 276 Days
pixall <- seq(1,30,1)
xday <- c(5,7,8,10,12,17,18,23,25)
dayplus <- pixall[-xday]

#Fill in NA to Make a Full Year for 276 Day Years
for (j in dayplus){
  for (i in 58:1){
    wide[[j]] <- rbind(c(i,NA,NA,NA,NA,NA,NA),wide[[j]])
  }
  for (i in 335:365){
    wide[[j]] <- rbind(wide[[j]],c(i,NA,NA,NA,NA,NA,NA))
  }
}

#Fill in NA to Make a Full Year for 275 Day Years
for (j in xday){
  for (i in 59:1){
    wide[[j]] <- rbind(c(i,NA,NA,NA,NA,NA,NA),wide[[j]])
  }
  for (i in 335:365){
    wide[[j]] <- rbind(wide[[j]],c(i,NA,NA,NA,NA,NA,NA))
  }
}


#Switch Data from Wide to Long Format
pix.long <- vector("list", length(wide))
pix.short <- vector("list", length(notfullyear))
pix.shortNF <- vector("list", length(wideNF))

for (i in 1:30){
  pix.long[[i]] <- melt(wide[[i]],id.vars="VALUE",variable.name="year",value.name="T")
  names(pix.long[[i]])[names(pix.long[[i]])=="VALUE"] <- "day"

  pix.short[[i]] <- melt(notfullyear[[i]],id.vars="VALUE",variable.name="year",value.name="T")
  names(pix.short[[i]])[names(pix.short[[i]])=="VALUE"] <- "day"

  pix.shortNF[[i]] <- melt(wideNF[[i]],id.vars="VALUE",variable.name="year",value.name="T")
  names(pix.shortNF[[i]])[names(pix.shortNF[[i]])=="VALUE"] <- "day"

}

#Check Lengths of Time Series
s.length=sapply(pix.short, function (x) length(x$T))
l.length=sapply(pix.long, function (x) length(x$T))

table(s.length)
table(l.length)

#Add Day Index Column
for (i in 1:30){
  pix.long[[i]]$cumday <- seq.int(nrow(pix.long[[i]]))
  pix.short[[i]]$cumday <- seq.int(nrow(pix.short[[i]]))
  pix.shortNF[[i]]$cumday <- seq.int(nrow(pix.shortNF[[i]]))
}

#Turn Into Data Objects
devtools::use_data(pix.shortNF, overwrite = TRUE, compress="xz")

devtools::use_data(pix.short, overwrite = TRUE, compress = "xz")

devtools::use_data(pix.long, overwrite = TRUE, compress="xz")



#Clean GDD Data

folder <- "../smosfiles/data-raw/GDD/"
files = list.files(path = folder,
                   pattern = "*.csv")
filenames <- gsub("\\.csv.*","",files)

for (i in 1:length(files)){
  assign(filenames[i],
         read.csv(paste(folder, files[i], sep=''),header=TRUE)
  )}

#Collect DataFrames into a List
list_df = lapply(filenames, get)

list_df<-lapply(
  list_df,
  function(x){
    x$daily_low_f[x$daily_low_f<50] <- 50
    x$daily_high_f[x$daily_high_f>86] <- 86
    x$daily_high_f[x$daily_high_f<50] <- 50
    x$gdd <- ((x$daily_high_f+x$daily_low_f)/2)-50
    return(x)
  }
)

#Make Year, Month, Day Seperate Columns; and add cumulative day index.
#Note 2012 was a Leap Year.

list_df <- lapply(
  list_df,
  function(x){
    x$date2 <- ymd(x$date)
    x$year <- year(x$date2)
    x$month <- month(x$date2)
    x$dat <- day(x$date2)
    x$cumday <- seq.int(nrow(x))
    return(x)
  }
)


#Grab One Pixel, 194406
pix1 <- list_df[[1]]


#Assume crops are in ground May 1 for all years
pix1 <- pix1 %>%
  mutate(planted = ifelse(date2 >"2010-05-01" & date2 <= "2010-12-31", 1,
                        ifelse(date2 > "2011-05-01" & date2 <= "2011-12-31", 1,
                        ifelse(date2 > "2012-05-01" & date2 <= "2012-12-31", 1,
                        ifelse(date2 > "2013-05-01" & date2 <= "2013-12-31", 1,
                        ifelse(date2 > "2014-05-01" & date2 <= "2014-12-31", 1,
                        ifelse(date2 > "2015-05-01" & date2 <= "2015-12-31", 1, 0)))))
  )
  )

#Not Sure Why this Isn't running cum sum over subset
#pix1 <- pix1 %>% group_by(year,planted) %>% mutate(out = ifelse(planted==1,cumsum(gdd),0))

pix1 <- ddply(pix1, .(year,planted), transform, Cumulative.Sum = cumsum(gdd))

pix1 <- pix1 %>% mutate(Cumulative.Sum = ifelse(planted==1,Cumulative.Sum,0))

#Remove Leap Year Day, Row 790
pix1 <- pix1[-790,]


#Join GDD with Tau Data
d <- smosfiles::pix.long

#Note: Not sure How to String Into One Long Series Given GDD aren't uniform acorss years.

pix1tau <- pix.long[[1]]

#Combine GDD Data Frame and Tau Data Frame for One Pixel
combo <- cbind(pix1,pix1tau)

#Get Valid Column Names
valid_column_names <- make.names(names=names(combo), unique=TRUE, allow_ = TRUE)
names(combo) <- valid_column_names

par(mfrow=c(3,2))
years <- seq(2010, 2015, by=1)

for (i in years){
combo10 <- combo %>% filter(year==i) %>% select(cumday,year, Cumulative.Sum, day, date, T)
#plot(combo10$Cumulative.Sum,combo10$T,ylab="T",xlab="GDD",main=i)
plot(combo10$day,combo10$T,ylab="T",xlab="Day",main=i)
}

plot(combo10$day,combo10$T)
plot(combo10$Cumulative.Sum,combo10$day,xlab="GDD",ylab="Day")
plot(combo10$day,combo10$Cumulative.Sum,xlab="Day",ylab="GDD")


plot(pix1tau$cumday,pix1tau$T)