R/bat_aggregate.R

Defines functions bat_aggregate

# Aggregate function for data ---------------------------------------------

# Aggregate individual bat test records into one row per sampling location.
#
# A sampling location is a unique combination of Country, Latitude and
# Longitude; each one receives a sequential `coord_id` in order of first
# appearance in `bat`.
#
# Arguments:
#   bat  Data frame with (at least) the columns Country, SiteName, SampleDate,
#        Positive, Latitude and Longitude.
#        NOTE(review): SampleDate is passed to format(..., "%b-%Y"), so it is
#        presumably a Date; Positive is presumably coded 0/1 (or FALSE/TRUE).
#        Confirm against the callers.
#
# Value:
#   Data frame with one row per location and the columns
#     coord_id, Country, Latitude, Longitude,
#     SiteName    site name; several names recorded at the same location are
#                 joined with " | ",
#     sampled_no  number of records at the location,
#     pos_no      number of records in the positive category,
#     visits_no   number of distinct sample dates,
#     dates       distinct sample dates, sorted, joined with "; ",
#     month_year  distinct "%b-%Y" labels of those dates, joined with "; "
#                 (month abbreviations depend on the session locale),
#     Positive    1 if pos_no > 0, otherwise 0,
#     prop        pos_no / sampled_no.
bat_aggregate <- function(bat) {
  required <- c("Country", "SiteName", "SampleDate", "Positive",
                "Latitude", "Longitude")
  absent <- setdiff(required, colnames(bat))
  if (length(absent) > 0) {
    stop("`bat` is missing required column(s): ",
         paste(absent, collapse = ", "), call. = FALSE)
  }
  bat <- bat[, required]

  # One id per unique (Country, Latitude, Longitude) combination.
  location_key <- c("Country", "Latitude", "Longitude")
  locations <- unique(bat[, location_key])
  locations$coord_id <- seq_len(nrow(locations))

  # Join on the full key that defines a location. Joining on the coordinates
  # alone would duplicate records whenever the same coordinates are listed
  # under more than one country.
  newbat <- merge(x = bat, y = locations, by = location_key)

  # Number of distinct sample dates (visits) per location. Locations can be
  # revisited days or months apart, so visits are counted separately from
  # the number of records.
  date_tab <- table(newbat$coord_id, newbat$SampleDate)
  visits_no <- data.frame(
    visits_no = as.vector(rowSums(date_tab > 0)),
    coord_id = as.numeric(rownames(date_tab))
  )

  # One site name per location: when several names were recorded at the same
  # location they are collapsed into a single " | "-separated label.
  sites <- unique(newbat[, c("coord_id", "SiteName")])
  # Work on character so a joined label is never rejected as an unknown
  # factor level.
  sites$SiteName <- as.character(sites$SiteName)
  names_per_id <- table(sites$coord_id)
  # Take the ids from the table names rather than from positions in the table.
  multi_name_ids <- as.numeric(names(names_per_id)[names_per_id > 1])
  for (id in multi_name_ids) {
    at_id <- sites$coord_id == id
    sites$SiteName[at_id] <- paste0(sites$SiteName[at_id], collapse = " | ")
  }
  sites <- unique(sites)

  # Sample dates per location, collapsed into "; "-separated strings.
  date_rows <- lapply(locations$coord_id, function(id) {
    visit_dates <- sort(unique(newbat$SampleDate[newbat$coord_id == id]))
    month_labels <- unique(format(visit_dates, format = "%b-%Y"))
    data.frame(
      dates = paste0(visit_dates, collapse = "; "),
      month_year = paste0(month_labels, collapse = "; "),
      coord_id = id,
      stringsAsFactors = FALSE
    )
  })
  dates <- do.call(rbind, date_rows)

  # Number of records per location.
  sampled <- table(newbat$coord_id)
  sampled_no <- data.frame(
    sampled_no = as.vector(sampled),
    coord_id = as.numeric(names(sampled))
  )

  # Number of positive records per location. With two or more distinct
  # Positive values the second category (in sorted order) is counted. When
  # only one distinct value is present, it is treated as positive only if it
  # is 1/TRUE; otherwise every location has zero positives.
  pos_tab <- table(newbat$coord_id, newbat$Positive)
  pos_col <- if (ncol(pos_tab) >= 2L) {
    2L
  } else {
    match(TRUE, colnames(pos_tab) %in% c("1", "TRUE"))
  }
  pos_counts <- if (is.na(pos_col)) {
    rep(0L, nrow(pos_tab))
  } else {
    as.vector(pos_tab[, pos_col])
  }
  pos_no <- data.frame(
    pos_no = pos_counts,
    coord_id = as.numeric(rownames(pos_tab))
  )

  # Every piece shares coord_id, so successive merges assemble one row per
  # location.
  bf <- Reduce(
    merge,
    list(locations, sites, sampled_no, pos_no, visits_no, dates)
  )
  bf$Positive <- ifelse(bf$pos_no > 0, 1, 0)
  bf$prop <- bf$pos_no / bf$sampled_no
  bf
}
nistara/eidithR documentation built on May 23, 2017, 2:54 p.m.