#' Picking one sample per person
#'
#' In the analysis it is sometimes useful to just have one sample per ENROLLID.
#' This is particularily true when we are simply interest in data specific to
#' the indivudal and not the sample. This function reduces a meta file that
#' may contain many SPECID/ENROLLID season pcr_result and reduces it so that just one SPECID
#' exists for each ENROLLID. We choose the best SPECID defined by
#' the sequenced sample
#' the sample that qualified for snv calling
#' the sample with highest titer.
#' If no samples were seqeunced we choose randomly.
#'
#' @param meta_df A data frame containing one row for each SPECID in the data set.
#' @return A data frame with one row for each ENROLLID in the data frame.
#' @examples
#'print(small_meta)
#'only_one(small_meta)
#'
#' @importFrom rlang quo
#' @importFrom dplyr enquo
#' @importFrom dplyr quo_name
#'@export
#'
only_one<-function(meta_df){ # one per person
only_one_helper<-function(x){
# if there is only one sample then we are done
if(nrow(x)==1){
return(x)
}else if(nrow(x)==2){
# There are two samples here with this person and pcr result
# pick the ones that we sequenced
pick<-which(x$sequenced==T)
if(length(pick)==1){
#if there is only one then we're done
return(x[pick,])
}
else if(length(pick)==0){
# we didn't sequence any of these
pick<-1
return(x[pick,]) # take the first one
}
else if(length(pick)==2){
# we sequenced both
# See how many qualified for snv calling
pick<-which(x$snv_qualified==T)
# Either 0 or 2 did
if(length(pick)==0 | length(pick)==2){
# take the one with a higher titer.
pick<-which(x$gc_ul==max(x$gc_ul))
}
return(x[pick,])
}
}else{
stop("more than 2 samples for this person?")
}
}
out <- meta_df %>% dplyr::group_by(ENROLLID,pcr_result,season) %>%
dplyr::do(only_one_helper(.))
return(out)
}
# ==============================================================================
# Transmission pairs
#' Validating transmission pairs
#'
#' Here we take a data frame where each row is a putative transmission pair
#' and determine which ones are valid. Valid pairs may only have the same
#' onset date if they are the index cases of the house. Such cases get a double
#' tag as well. Otherwise we only except pairs that do not jump a middle case,
#' and there can not be 2 possible donors (same onset date) for a recipient.
#' That is thrown out.
#'
#' @param tp A data frame containing one row for each possible transmission pair
#' @return A data frame with one row for each possible transmission pair
#'
#' @export
finding_valid<-function(tp){
# This function will take in a household and determine whether or not each pair
# is it is a valid transmission event - Note we have not subset on those with useful sequence data yet.
# This the index date for the house it is used later
finding_valid_helper<-function(x){
# for each possible recipient in the house we will find which donor is valid
# Get the difference in onset date
x=dplyr::mutate(x,diff_onset=onset2-onset1)
# valid pairs at this point must have a difference on onset date >0.
valid_onsets=x$diff_onset[x$diff_onset>0]
# the recipient is sick on the house index case and onset1==onset2 and there is no other possible donor date - these are all sick on the first day and need to be handeled in both directions
if(identical(x$onset1,x$onset2) & all(x$onset2 == min_onset)){
x$valid<-T
x$double<-T
}
else if(length(valid_onsets)>0){
# There are some valid donors
# If there is only 1 valid donor here with onset date closest to that of recipient (i.e donors not sick on the same day)
if(length(which(x$diff_onset==min(valid_onsets)))==1){
x$valid[x$diff_onset==min(valid_onsets)]=T
}
}
# otherwise we leave them as false
x<-subset(x,select=-c(diff_onset)) # get rid of this column
return(x)
}
min_onset<-min(tp$onset1)
if(nrow(tp)==1){ # there is only one pair here so it's valid
tp$valid=T
# there is only one pair and they are sick on the same day
# so this is valid but we will need to look at transmission both ways
if(tp$onset1==tp$onset2){
tp$double=T
}
return(tp)
}
else{
# There are multiple possible pairs here
# All that is left is to remove the pairs that skip an individual - arc over the top.
# For this we will look at each recipeint and validate the pair that has the minimum diff in onset date
# but whose difference is >0
tp %>% dplyr::group_by(ENROLLID2) %>%
dplyr::do(finding_valid_helper(.))->tp_multi
return(tp_multi)
}
}
#' Finding transmission pairs
#'
#' Here we take a data frame and look for all cases where two individuals are
#' sick within a week of each other. This does not compute wether or not the
#' pairs match our criteria yet
#'
#' @param meta_one A data frame containing one row for each ENROLLID in the data set.
#' @return A data frame with one row for each possible transmission pair
#' @examples
#' one_meta<-only_one(small_meta)
#' getting_tp(one_meta)
#'@importFrom dplyr tibble as_tibble
#'@export
#'
getting_tp<-function(meta_one){ # The data frame is all for 1 house. The only criteria here is that the samples are from the same house, and onset is within a week of eachother. The data.frames are ordered such that onset2 is on the same day or later than onset1
getting_tp_helper<-function(df){
house<-unique(df$HOUSE_ID)
stopifnot(length(house)==1,length(unique(df$pcr_result))==1) # Verify only 1 house here and only 1 strain
if(length(unique(df$ENROLLID))!=length(unique(df$SPECID))) stop("Please supply a data frame with only one entry/ENROLLID")
pairs<-tibble(ENROLLID1=NA,ENROLLID2=NA,onset1=NA,onset2=NA,transmission=NA,
sequenced1=NA,sequenced2=NA,gc_ul1=NA,gc_ul2=NA,
snv_qualified1=NA,snv_qualified2=NA,
valid=NA,double=NA)[F,]
# verify more than 1 person in the household
if(length(unique(df$ENROLLID))>1){
# Order the data frame based on onset date. Earliest is first
df<-df[order(df$onset,df$ENROLLID,decreasing = F),]
# loop through and get all possible transmission pairs and make 1
# row for each possible transmission pair I know this is not tidy but
# that's ok for now
for(i in 1:(nrow(df)-1)){
ENROLLID1=df$ENROLLID[i]
onset1=df$onset[i]
sequenced1=df$sequenced[i]
gc_ul1=df$gc_ul[i]
snv_qualified1=df$snv_qualified[i]
for(j in (i+1):nrow(df)){
ENROLLID2=df$ENROLLID[j]
onset2=df$onset[j]
sequenced2=df$sequenced[j]
gc_ul2=df$gc_ul[j]
snv_qualified2=df$snv_qualified[j]
transmission=onset2-1
if((onset2-onset1)<=7){ #if onset is withing a week of each other
out<-tibble(ENROLLID1=ENROLLID1,ENROLLID2=ENROLLID2,
onset1=onset1,onset2=onset2,
transmission=transmission,
sequenced1=sequenced1,sequenced2=sequenced2,
gc_ul1=gc_ul1,gc_ul2=gc_ul2,
snv_qualified1=snv_qualified1,snv_qualified2=snv_qualified2,
valid=F,double=F,stringsAsFactors=F) # all start as not valid
pairs<-rbind(pairs,out)# We don't want that first line with the NA this just adds the out data frame with the transmission pair if it meets the onset date
}
}
}
pairs<-subset(pairs,!(is.na(onset1)) & !(is.na(onset2))) # There is a sample without this data. we can't use it
if(nrow(pairs)>0){
valid_pairs<-finding_valid(pairs)
valid_pairs<- dplyr::mutate(valid_pairs,# Just adding columns to summarize the pairs.
sequenced_pair=sequenced1==sequenced2 & sequenced2==T,
titer_pair= sequenced1==sequenced2 & sequenced2==T & gc_ul1>1e3 & gc_ul2>1e3,
snv_qualified_pair = snv_qualified1==T & snv_qualified2==T)
return(valid_pairs)
}else{
pairs<-dplyr::mutate(pairs,# Just adding columns to summarize the pairs.
sequenced_pair=NA,
titer_pair= NA,
snv_qualified_pair = NA)
return(pairs[F,])
}
#return(pairs)
} else {
pairs<-dplyr::mutate(pairs,# Just adding columns to summarize the pairs.
sequenced_pair=NA,
titer_pair= NA,
snv_qualified_pair = NA)
return(pairs[F,]) # We need to return a data.frame for each entry. This is empty
}
}
result <- meta_one %>% dplyr::group_by(HOUSE_ID,pcr_result,season) %>%
dplyr::do(getting_tp_helper(.))
return(result)
}
#' Get all ENROLLID that have two SPECID in a season with same strain..
#'
#' This is useful to get the samples we have intrahost data for.
#'
#' @param df A data frame with at least ENROLLID, SPECID, season, and pcr_result. Each row is an isolate.
#' @param samples The number of samples required to keep ENROLLID default = 2.
#' @return the a data frame with only ENROLLID with multiple isolates - each row is an isolate
#' @examples
#' # There one shared iSNV here and one fixed difference the rest are the same.
#' print(small_meta)
#'
#' get_double(small_meta)
#'
#' @export
#'
#'
#'
get_double<-function(df,samples=2){
counts <- df %>%
dplyr::group_by(ENROLLID,pcr_result) %>%
dplyr::summarize(samp = length(unique(SPECID)))
doubles<- counts %>% dplyr::filter(samp==samples) %>%
dplyr::select(-samp)
out <- merge(df,doubles,by =c("ENROLLID","pcr_result"),all.y = T )
return(out)
}
two_one<-function(df,columns,sn){
num<-paste0(columns,as.character(sn))
num_df<-dplyr::select(df,num)
for(i in 1:length(columns)){
names(num_df)[names(num_df)==num[i]]<-columns[i]
}
return(num_df)
}
#' Convert short paired data to long paired data.
#'
#' This function takes in a data frame containing short paired data with one
#' row / pair and outputs one row / ENROLLID in the pair. If the data does not
#' contain a column named 'pair_id' one will be added with a unqiue number for
#' each pair. This is currently groups by pair_id
#'
#' @param df A data frame with obseravtions to be split
#' @param columns columns to be added. They must be equal to names that already exist
#' in every way except lacking either a 1 or a 2 at the end.
#'
#' @return the a data frame with a row for each ENROLLID
#' @examples
#' one_meta<-only_one(small_meta)
#' tp<-getting_tp(one_meta)
#' longform_pairs(tp)
#' @export
#'
#'
#'
longform_pairs<-function(df,columns = c("ENROLLID","onset","gc_ul","sequenced")){
if(!("pair_id" %in% names(df))){
df$pair_id<-1:nrow(df)
}
lp_helper<-function(df,columns){
# we want to add single columns capturing the data held in two columns here.
id<-unique(df$pair_id)
one<-two_one(df,columns,1)
two<-two_one(df,columns,2)
out<-rbind(one,two)
out$pair_id<-id
one_columns<-paste0(columns,"1")
two_columns<-paste0(columns,"2")
extra_col<-c(one_columns,two_columns)
df_other_rows<-dplyr::select(df,-dplyr::one_of(!!extra_col))
out<-merge(out,df_other_rows,by="pair_id")
if(out$onset[1]<out$onset[2]){
out$case<-c("Donor","Recipient")
} else if(out$onset[1]>out$onset[2]){
out$case<-c("Recipient","Donor")
}else{
out$case<-c("Unknown","Unknown")
}
return(out)
}
out <- df %>% dplyr::group_by(pair_id) %>%
dplyr::do(lp_helper(.,columns))
return(out)
}
#' Convert long paired data to short paired data.
#'
#' This function takes in a data frame containing long paired data with one
#' row / sample in pair and outputs one row / pair. It is on the user to supply
#' grouping columns such that only 2 samples will match the groups. (we don't
#' handled more than 2 cases currently). If there are any columns that contain
#' data not shared by the sample pair they must be included or removed prior to using
#' the function
#'
#' @param df A data frame with obseravtions to be combined
#' @param columns columns to be used in combination. They must exist and new columns
#' will be made appending 1 and 2 to these names
#' @param ... unquoted grouping columns
#' @return the a data frame with a row for each pair
#' @examples
#' get_double(small_meta)->small_doub
#' small_doub<-dplyr::select(small_doub,ENROLLID,pcr_result,HOUSE_ID,SPECID,
#' onset,collect,vaccination_status,DPI,season,gc_ul,sequenced,home_collected,snv_qualified)
#' columns = c("SPECID","onset","collect","vaccination_status",
#' "DPI","gc_ul","sequenced","home_collected","snv_qualified")
#' short_pairs(small_doub,columns,ENROLLID,HOUSE_ID,pcr_result,season)
#' @export
short_pairs<-function(df,columns, ... ){
short_helper<-function(x){
x<-x[order(x$collect,-x$home_collected,decreasing = F),] # This puts the home sample first if they are taken on the same day.
for(col in columns){
for(i in 1:2){
c<-paste0(col,i)
x[c] <- x[i,col]
}
}
x<-dplyr::select(x,-dplyr::one_of(!!columns))
# we wil remove an unneeded row we want to make sure we aren't removing data.
if(any(x[1,]!=x[2,])) stop("One or more columns were not the same.\n remove them.")
return(x[1,])
}
group_var= dplyr::quos( ... )
out<- df %>% dplyr::group_by(!!!group_var) %>%
dplyr::do(short_helper(.))
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.