# Global knitr chunk option: echo the source code of every chunk in the output
knitr::opts_chunk$set(echo = TRUE)

General Workflow

Data were received from Steve Latta as separate spreadsheets, which were merged into a single .csv file in the 0_proofing_merging....Rmd script

This present script tidies/cleans up the data.

The following script makes final changes to prepare for analysis

Note that the data worked up here give slightly different results than appear in the original rejected MS; this is likely due to tabulation errors during the original work. In general the rank-order of all comparisons (e.g., when looking at differences between sites) remains the same

General Description of what Nathan Brouwer's data cleaning script(s) do

What follows is a brief overview of my general approach to cleaning data. This is mostly boilerplate. See 0_proofing_merging....Rmd for more details

SCRIPT 1: reshaping, tidying, and cleaning data

The following steps aren't necessarily all distinct in practice or necessarily occur in this precise order

Step 1.1: "reshaping" and "tidying"

"reshape"

taking data from the format it was entered in and putting it into the format that works best w/ R; usually reshaping from "wide" to "long" format. E.g., going from data with each subplot in a different column to all data in a single column, with subplot number indicated in a separate column.

"tidying"

converting into an analysis-ready format. For example, splitting columns that combine treatment and plot number into separate columns (treatment column, subplot column).

Step 1.2: "cleaning"

Taking reshaped data and fixing any issues such as typos, incorrect column names, data entered into the wrong field, incorrect species names, etc.

After cleaning and tidying, the data could be "reshaped" (or recast) back into its original format and there would still be an ~1:1 correspondence with the datasheet.

Libraries

library(here)
library(reshape2)
library(stringr)
library(stringi)
library(lubridate)

Preliminaries

Set up working directories

Use here::here()

# Print the project root that here resolved, as a quick sanity check
here::here()

# Path to the merged capture file, built relative to the project root
filename <- here::here(
  "data-raw",
  "data-raw-mencia",
  "mencia_all_captures_by_year",
  "mencia_all_years.csv"
)

Load data

Load .csv w/ all years of capture records stacked (single entry per bird; not within-season recaps for site persistence)

# Read the stacked capture records. Text columns stay character (factors are
# created explicitly later) and BAND.SUF is forced to character on import.
# NOTE(review): presumably so leading zeros in band suffixes survive - confirm.
mencia <- read.csv(
  filename,
  skip = 0,
  colClasses = c(BAND.SUF = "character"),
  stringsAsFactors = FALSE
)

# Quick look at what was loaded: dimensions first, then the first rows
dim(mencia)
head(mencia)

Basic cleaning

Change all headings to lower case

# Lower-case every column heading so later code can use one naming style
colnames(mencia) <- tolower(colnames(mencia))

Numbers in notes column are beak lengths

Used for sexing. Put into a separate column

# Remove every ASCII letter from the notes and parse what remains as a number;
# anything that does not parse cleanly ends up as NA
mencia$beak <- as.numeric(gsub("[a-zA-Z]", "", mencia$notes))

# Check the range of extracted values and the number of NAs
summary(mencia$beak)

Convert to factors

# Columns that are categorical identifiers/labels rather than measurements
factor.cols <- c("band.pre", "band.suf", "colors", "age", "year")

# Convert each of them to a factor in place
mencia[factor.cols] <- lapply(mencia[factor.cols], factor)

Check out data

# Overall summary of every column
summary(mencia)

# Full level counts for selected columns; maxsum is raised so that no levels
# are lumped together in the printed output
summary(factor(mencia$fat), maxsum = 1000)
summary(factor(mencia$sex), maxsum = 1000)
summary(factor(mencia$notes), maxsum = 1000)

Dates

# Parse the day-month-year text dates into a new column, date2
mencia$date2 <- lubridate::dmy(mencia$date)

Order by date

# Sort the records by parsed date, earliest first (order() puts NA dates last)
mencia <- mencia[with(mencia, order(date2)), ]

Sites

Some years sites have "A" label; remove

# Cross-tabulate captures by year and site to see which site labels occur when
table(year = mencia$year, site = mencia$site)

# Drop a trailing " A" from site labels, then rebuild the factor
mencia$site <- factor(sub("[ ][A]$", "", mencia$site))

Fix typos

# Colour codes: lower-case "x-" should be upper-case "X-"
mencia$colors <- factor(gsub(pattern = "x-",
                             replacement = "X-",
                             x = mencia$colors))

# Species codes: remove the stray hyphen in AM-KE
mencia$species <- factor(gsub(pattern = "AM-KE",
                              replacement = "AMKE",
                              x = mencia$species))

# Sex: "U" (unknown) is stored as a missing value
mencia$sex <- factor(replace(mencia$sex, which(mencia$sex == "U"), NA))

Species in study

Need to check species codes to make sure they are current and no typos.

Some of these are out of date. This will be fixed in the "Scrubbing" script. This current work is done to send the info to Steve Latta for review

  # library(reshape2)
  # library(ggplot2)
  # library(ggpubr)
  # spp.count <- dcast(data = mencia,
  #       formula = species ~ .,
  #       fun.aggregate = length)
  # names(spp.count) <- c("species","tot.captures")
  # 
  # i.order <- order(spp.count$tot.captures,decreasing = T)
  # spp.count$species <- factor(spp.count$species,
  #                             levels = spp.count$species[i.order])
  # 
  # spp.count <- spp.count[i.order,]
  # 
  # #plot
  # ggbarplot(data = spp.count,
  #           x = "species", y = "tot.captures") +
  #   theme(axis.text.x = element_text(angle = 90, hjust = 1))

Save table of species names for sending to S Latta

  #write.csv(spp.count, file = "temp_spp_codes.csv")

Calculate age of sites

Because the study occurs over several years, the sites undergo succession and their exact age each year is important to consider

# The initial site ages below are matched to levels(mencia$site) purely by
# position, so they depend on the factor's sorted level order and on there
# being exactly four sites. Fail loudly instead of letting data.frame()
# recycle values if that is not the case.
stopifnot(nlevels(mencia$site) == 4)

# One row per site with its starting age.
# NOTE(review): presumably age in years at the 2003 baseline used below - confirm.
site.age.tab <- data.frame(site = levels(mencia$site),
                           site.age.init = c(20, 5, 2, 10))

# Attach site.age.init to every capture record, joining on site.
# merge() returns rows sorted by the key, so the earlier date ordering is lost.
mencia <- merge(mencia, site.age.tab, by = "site")

# Nothing in this script creates year.num, and year is stored as a factor, so
# derive the numeric year from the year labels when the raw file did not
# supply that column
if (is.null(mencia$year.num)) {
  mencia$year.num <- as.numeric(as.character(mencia$year))
}

# Site age in a given year = initial age + years elapsed since 2003
mencia$site.age <- mencia$site.age.init + mencia$year.num - 2003

Create unique ID for each bird

# Bird ID = band prefix and band suffix joined by a hyphen
mencia$band <- paste(mencia$band.pre, mencia$band.suf, sep = "-")

Code focal migrants

These are the species focused on in the original rejected MS

# Species codes of the focal migrants
focal.mig <- c(
  "OVEN", "BAWW", "COYE", "AMRE",
  "CMWA", "BTBW", "PAWA", "PRAW"
)

# Start every record as unclassified, then flag the rows of focal migrants
mencia$status.focals <- NA
i <- which(is.element(mencia$species, focal.mig))
mencia$status.focals[i] <- "mig"

Focal residents

These are the species focused on in the original rejected MS

# Species codes of the focal residents. Two of these are old codes that are
# replaced later in this script.
focal.res <- c(
  "HLCU",  # changes to HILC
  "STOF", "RLTH", "NOMO",
  "GRWA",  # changes to GTGT
  "BANA", "BCPT", "YFGR", "BFGR", "GABU"
)

# Flag the rows of focal residents in the existing status.focals column
i <- which(is.element(mencia$species, focal.res))
mencia$status.focals[i] <- "res"

Check for recaps at different ages

A bird could be aged into 2 different age classes across captures; this did not appear to occur. Makes sense b/c most birds are not recaptured. Birds captured as HY often die or move to a new site

  # x <- dcast(data= mencia[i.unique,],
  #       formula = band + species + site ~ age,
  #       value.var = "age",
  #       fun.aggregate = length)
  # 
  # 
  # timescap <- apply(x[,c("AHY","ASY","HY","SY","NA")],1,sum)
  # max(timescap)

Load spp meta / trait info

# Species metadata / trait table. Text columns stay character; FALSE is
# spelled out because F is an ordinary variable that can be reassigned.
spp.meta <- read.csv(here::here("data", "spp_list.csv"),
                     stringsAsFactors = FALSE)

Subset columns I want to use

# Print both status columns so they can be compared by eye
spp.meta$status
spp.meta$status2

# Keep only the columns used downstream: spp.code, spp, status2, hab1, diet
spp.meta <- spp.meta[, c("spp.code", "spp", "status2", "hab1", "diet"),
                     drop = FALSE]
summary(spp.meta)

Change GRWA -> GTGT

Green-tailed ground warbler and green-tailed ground tanager are the same species

Change column name

# Rename the species column to spp.code (gsub rewrites "species" wherever it
# occurs in a column name)
names(mencia) <- gsub(pattern = "species",
                      replacement = "spp.code",
                      x = names(mencia))

# Work on plain character codes so individual values can be overwritten
mencia$spp.code <- as.character(mencia$spp.code)

# Replace the two outdated codes with their current equivalents
mencia$spp.code[mencia$spp.code %in% "GRWA"] <- "GTGT"
mencia$spp.code[mencia$spp.code %in% "HLCU"] <- "HILC"

MYWA (myrtle warbler) -> YRWA (yellow-rumped warbler); NUMA (nutmeg mannikin) -> SBMU (scaly-breasted munia); WHQD -> WFQD

# Swap three more outdated species codes for the current ones
mencia$spp.code <- replace(mencia$spp.code,
                           mencia$spp.code %in% "MYWA", "YRWA")
mencia$spp.code <- replace(mencia$spp.code,
                           mencia$spp.code %in% "NUMA", "SBMU")
mencia$spp.code <- replace(mencia$spp.code,
                           mencia$spp.code %in% "WHQD", "WFQD")

Merge cleaned data w/ species meta data

Merge

# Column names going into the join
names(mencia)
names(spp.meta)

# Full outer join on the columns the two tables share, so rows present in only
# one of the tables are kept at this stage
mencia2 <- merge(mencia, spp.meta, all = TRUE)

# Column names of the result (inspected only after mencia2 exists)
names(mencia2)

# Compare row counts before and after the join
dim(mencia)
dim(mencia2)

# Drop rows with no site, e.g. rows contributed only by spp.meta. A logical
# mask is used instead of -which(...), because -which() selects zero rows
# when there is nothing to remove.
mencia2 <- mencia2[!is.na(mencia2$site), ]
mencia <- mencia2

Save cleaned data

Save .csv

# Output path for the cleaned data, in the same folder as the raw merged file
filename <- here::here("data-raw",
                       "data-raw-mencia",
                       "mencia_all_captures_by_year",
                       "mencia_cleaned.csv")

# Write without row names; FALSE is spelled out because F can be reassigned
write.csv(mencia, file = filename, row.names = FALSE)



brouwern/DRmencia documentation built on May 6, 2019, 12:24 p.m.