#' @title getStructuredData
#' @description A function for structuring data
#' @details The function must return in a specific format.
#' @param data a list with a evidence table (mix) and a reference table (ref)
#' @param ln markers to consider (a given order)
#' @param minLoc is minimum number of overlapping markers to be identical (for partial profiles).
#' @export
getStructuredData = function(data,ln,minLoc=10) {
#data is a list with a evidence table (mix) and a reference table (ref):
#ln the markers to consider (a given order)
#popFreq is a list with allele freqs popFreq[[locus]]. NB: Must contain same loci as in ln
#minLoc is minimum number of overlapping markers to be identical (for partial profiles).
#Evidences: colnames(data$mix)=("SampleName","Markers","Alleles","Heights"). Separated with "/" in text format.
#References: colnames(data$ref)=("SampleName","Markers","Alleles"). Separated with "/" in text format.
if(nrow(data$mix)>0) data$mix[,2] <- toupper(data$mix[,2]) #LOCUS-names are assigned as Upper-case! This is important to do!
if(nrow(data$ref)>0) data$ref[,2] <- toupper(data$ref[,2]) #LOCUS-names are assigned as Upper-case! This is important to do!
sortGenotypes = function(tmp) {
suppressWarnings({
v1 <- as.numeric(tmp[,1])
v2 <- as.numeric(tmp[,2])
})
isNA <- is.na(v1) | is.na(v2)
swap <- tmp[,2]<tmp[,1] #sort alleles by text
swap[!isNA] <- v2[!isNA]<v1[!isNA] #sort alleles by number
tmp[swap,1:2] <- tmp[swap,2:1]
return(tmp)
}
#Fix ref-alleles: Insert single alleles, order alleles in ref by increasing order
if(nrow(data$ref)>0) {
isempty <- data$ref[,3]==""
impute <- !isempty & !grepl("/",data$ref[,3]) #have only one allele. Must get correct format: 2 alleles
data$ref[impute,3] <- paste0(data$ref[impute,3],"/",data$ref[impute,3]) #include twice
ord <- !isempty & !impute
tmp <- unlist( strsplit( data$ref[ord,3],"/") )
if(length(tmp)!=2*sum(ord)) { #don't allow more than two alleles
stop("The number of alleles for some of your references was greater than two! Please make sure not to exceed two.")
}
tmp <- t(matrix(tmp ,nrow=2 )) #get genotypes for each markers
tmp <- sortGenotypes(tmp) #get sorted genotypes
data$ref[ord,3] <- paste0(tmp[,1],"/",tmp[,2]) #make sure it get correct order
}
#FIRST: DUPLICATED REF-PROFILES WILL BE REMOVED, GIVEN A NOTIFICATION AS MESSAGE
#SECOND: REF-PROFILES EQUAL EVID-PROFILES (SINGLE) WILL BE DETECTED, WILL BE NOTIFIED BEH
#THIRD: SINGLE EVID-PROFILES WILL BE ASSIGNED AS AN UNKNOWN (1,2,3 etc)
refNames = evidNames = NULL
if(nrow(data$ref)>0) refNames <- unique(data$ref[,1]) #vector of unique ref-samples
if(nrow(data$mix)>0) evidNames <- unique(data$mix[,1]) #vector of unique evid-samples
stringRef <- rep("",length(refNames))
stringEvid <- rep("",length(evidNames))
nClow <- rep(0,length(evidNames)) #used to get lower boundary of number of contributors in evidence profiles
for(loc in ln) { #go through each loci and create a string per sample profiles: Used to detect similarity
#loc = ln[1]
subRef <- data$ref[data$ref[,2]==loc,,drop=F]
subRef <- subRef[match(refNames,subRef[,1]),,drop=F] #get correct order of data (NECESSARY IF IMPORT NOT CONSISTENT)
subEvid <- data$mix[data$mix[,2]==loc,,drop=F]
subEvid <- subEvid[match(evidNames ,subEvid[,1]),,drop=F] #get correct order of data (NECESSARY IF IMPORT NOT CONSISTENT)
subEvid[is.na(subEvid[,3]),3] <- "NA" #add string NA
subEvid[subEvid[,3]=="",3] <- "NA" #add string NA
subRef[is.na(subRef[,3]),3] <- "NA" #add string NA
subRef[subRef[,3]=="",3] <- "NA" #add string NA
#add loci separation sign before adding alleles to vector
if(which(loc==ln)>1) {
if(length(refNames)>0) stringRef <- paste0(stringRef,"-")
if(length(evidNames)>0) stringEvid <- paste0(stringEvid,"-") #add loci separation sign
}
stringRef <- paste0(stringRef, subRef[,3]) #the alleles of ref is inserted (notice the order)
stringEvid <- paste0(stringEvid, subEvid[,3]) #the alleles of ref is inserted (notice the order)
if(length(evidNames)>0) { #require at least one evidence
nCtmp <- ceiling(sapply(strsplit(subEvid[,3],"/"),length)/2)
nCtmp[subEvid[,3]=="NA"] <- 0 #set to zero if na
ind <- nCtmp>nClow #evids to update
nClow[ind] <- nCtmp[ind]
}
} #end for each loci
#Fix single source strings:
evidNamesSS <- evidNames[nClow==1] #consider only single source profiles
stringEvidSS <- stringEvid[nClow==1] #consider only single source profiles
#Fix stringEvid2: Impute homozygous alleles, and order the alleles:
if(length(evidNamesSS)>0) { #require at least 1 sample
stringEvidSS2 <- t(matrix( unlist(strsplit(stringEvidSS,"-")),ncol=length(stringEvidSS) )) #get genotypes for each markers
isempty <- stringEvidSS2=="NA"
impute <- !isempty & !grepl("/",stringEvidSS2) #have only one allele. Must get correct format
stringEvidSS2[impute] <- paste0(stringEvidSS2[impute],"/",stringEvidSS2[impute]) #include alleles twice
ord <- !isempty & !impute #markers with heteroyzogous variants
tmp <- unlist( strsplit( stringEvidSS2[ord],"/") )
tmp <- t(matrix(tmp ,nrow=2 )) #get genotypes for each markers
tmp <- sortGenotypes(tmp) #get sorted genotypes for evidence
stringEvidSS2[ord] <- paste0(tmp[,1],"/",tmp[,2]) #
#update the evid-strings stringEvidSS and stringEvid again:
stringEvidSS <- numeric() #restart this
for(loc in ln) { #go through each loci and create a string per sample profiles: Used to detect similarity
ind <- which(loc==ln)
if(ind>1) {
stringEvidSS <- paste0(stringEvidSS,"-") #add loci separation sign
}
stringEvidSS<- paste0(stringEvidSS, stringEvidSS2[,ind]) #the alleles of ref is inserted (notice the order)
}
stringEvid[nClow==1] <- stringEvidSS #update evid-string
} #end for SS samples
#DETECT SIMILAR REFERENCES:
#evidMatch is a vector with identical=match info ("mixture","refname","unknown X")
evidMatch <- rep("mixture",length(evidNames))
if(length(refNames)>0) {#IF ANY REFERENCES
refU <- unclass(factor(stringRef)) #get similar refs (each number is a unique reference)
isdup <- duplicated(refU) #which are duplicates?
refDups <- refNames[isdup] #these refs are duplicates
data$ref <- data$ref[!data$ref[,1]%in%refDups,] #update refs such that non-duplicates are keeped
if( length(refDups)>0 ) {
dups <- refU[isdup] #these are duplicate grps
dupsUn <- unique(dups) #get unique duplicate grps
for(gg in dupsUn) {
print(paste0("Equal references detected: ", paste0(refNames[refU==gg],collapse="="), ". Keeping first."))
}
}
refNames <- na.omit(refNames[!isdup]) #unique refs: refnames
stringRef <- stringRef[!isdup] #unique refs: refalleles
stringRef2 <- t(matrix( unlist(strsplit(stringRef,"-")),ncol=length(stringRef) )) #get genotypes for each markers
nR = length(stringRef) #get references
#DETECT SIMILAR EVIDENCE AND REF - PROFILES:
#SHOULD BE SIMILAR TO EFFICIENT MAC COMPARISON
if(length(evidNamesSS)>0) { #require at least 1 sample
unStringEvidSS <- unique(stringEvidSS) #get unique SS-string
stringEvidSS2 <- unique(stringEvidSS2) #get corresponding uniques
nS = length(unStringEvidSS) #get unique SS samples
print(paste0("Comparing references against SingleSource samples: ",nS*nR," comparisons."))
matchvec <- rep(NA,nS) #create a vector for finding matches
for(ss in 1:nS) { #for each sample: We calculate pairwise comparison
#ss=1
macS <- missmatch <- nLocs <- rep(0,nR) #make vector for all references
for(loc in ln) { #for each locus: Vectorizing as much as possible!
ll=which(loc==ln)
if(is.na(stringEvidSS2[ss,ll]) || stringEvidSS2[ss,ll]=="NA") next #skip if no data
sttmp <- unlist(strsplit(stringEvidSS2[ss,ll],"/")) #get alleles in single source profile (2 ordered alleles )
isna <- is.na(stringRef2[,ll]) | stringRef2[,ll]=="NA" #get which refs are non-zero. Not counted if missing!
if(all(isna)) next #skip if no references
AvecList <- t(matrix(unlist(strsplit(stringRef2[!isna,ll],"/")),nrow=2)) #get alleles of refs (in case of 2)
#Used for exact criterion
missmatch[!isna] = missmatch[!isna] + as.integer(AvecList[,1]!=sttmp[1]) + as.integer(AvecList[,2]!=sttmp[2])
#Used for inclusion criterion
numWithin <- rep(0,sum(!isna)) #number of unique alleles of reference that are in stain
for(aa in unique(sttmp)) numWithin <- numWithin + rowSums(AvecList==aa) #sum up for each alleles in stain
if( sttmp[1]==sttmp[2] ) { #if SS had homozygout genotype
ind <- AvecList[,1]!=AvecList[,2] & numWithin==2 #those not same but got MAC=2
numWithin[ind] <- numWithin[ind] - 1 #subtract 1 because these were not homozygous
}
macS[!isna] <- macS[!isna] + numWithin #add number of matching alleles
nLocs[!isna] <- nLocs[!isna] + 1 #add locus
} #end for each locus
#get number of missmatches based on inclusion criterion
missmatch2 = 2*nLocs - macS
foundind = which(missmatch==0 & nLocs>=minLoc ) #index with zero mismatches AND EVALUATED AT LEAST minLoc
foundind2 = which(missmatch2==0 & nLocs>=minLoc ) #index with zero mismatches AND EVALUATED AT LEAST minLoc
indEvids = unStringEvidSS[ss]==stringEvid #index of evidence that is considered
if(length(foundind)>0) { #if found an exact match
evidMatch[indEvids] = refNames[ foundind[which.max(nLocs[foundind])] ]
if(length(foundind)>1) print("Several references matched (SIMILAR REFERENCES). Keeping the one with most loci!")
} else if( length(foundind2)>0 ) { #if found an inclusion match
txt = paste0("NOTE: ",refNames[ foundind2[which.max(nLocs[foundind2])] ]," satisfies inclusion in ",paste0(evidNames[indEvids],collapse="/") )
print(txt)
}
#else if no match found
} #end for each SS sample
rm(AvecList ,numWithin,missmatch ) ; gc()
} #end if having SS samples
} #end IF ANY REFS
#sort structures: Hierarchical differences
evidord <- order(nchar(stringEvid)) #sort evid with respect to number of characters
stringEvid <- stringEvid[evidord] #update order
evidNames <- evidNames[evidord] #update order
evidMatch <- evidMatch[evidord]
names(evidMatch) <- evidNames #get vector of match status for all evidence profiles
nClow <- nClow[evidord] #update order
#CREATE STRUCTURE: data-frame tables + lists
dfmix <- matrix(ncol=length(ln),nrow=length(evidNames))
dfref <- matrix(ncol=length(ln),nrow=length(refNames ))
colnames(dfref) <- colnames(dfmix) <- ln
rownames(dfmix) <- evidNames
rownames(dfref) <- refNames
mixL <- list() #refL is not considered anymore
#Structure Evidences into list and matrix
for(mix in evidNames) { #for each evid profile: CAN BE TIMECONSUMING FOR MANY EVIDS!
#mix = evidNames[1]
mixL[[mix]] = list()
subEvid <- data$mix[ data$mix[,1]==mix,,drop=F] #get evid data
subEvid <- subEvid[match(ln,subEvid[,2]),,drop=F] #get correct order of loci (NECESSARY IF IMPORT NOT CONSISTENT)
subEvid[is.na(subEvid[,3]),3:4] <- "" #add strings "" if NA
tryCatch({
dfmix[which(evidNames==mix),] = subEvid[,3] #extract import data
}, error = function(e) print( paste0("There was an inconsistent number of loci observations for evidence ",mix)) )
adata = strsplit(subEvid[,3],"/")
hdata = strsplit(subEvid[,4],"/")
for(loc in ln) { #loop for each loci
locind = which(loc==ln)
if(length(adata[[locind]])!=length(hdata[[locind]])) {
outtxt <- paste0("!WARNING! ",mix,"-",loc,": The number of alleles was not same as the number of peaks")
print(outtxt)
}
mixL[[mix]][[loc]] = list(adata=adata[[locind]],hdata=as.numeric(hdata[[locind]]))
}
}
#helpfunction of whether a locus is a Y-marker
isY = function(x) return( toupper(substr(x,0,2))=="DY" || toupper(substr(x,0,1))=="Y" )
#Structure References into matrix
for(loc in ln) { #
#loc=ln[26]
subRef <- data$ref[data$ref[,2]==loc,,drop=F]
subRef <- subRef[match(refNames,subRef[,1]),,drop=F] #get correct order of data (NECESSARY IF IMPORT NOT CONSISTENT)
subRef[is.na(subRef[,3]),3] <- "" #add string "" if NA
#HANDLING Y-markers for references:
if( isY(loc) ) { #check if Y-marker
consInd = which(subRef[,3]!="") #indices to consider
if(length(consInd)>0) {
tmp = matrix(unlist(strsplit(subRef[consInd,3],"/")),nrow=2) #get all genotypes
changeInd = tmp[1,]==tmp[2,] #homozygous variants are changed
if(any(consInd)) subRef[consInd[changeInd],3] <- tmp[1,changeInd] #insert only 1 allele variant
}
} #end if Y-marker
tryCatch({
dfref[,which(ln==loc)] = subRef[,3] #extract import data
}, error = function(e) print( paste0("There was an inconsistent number of reference observations at locus ",loc)) )
}
evidMatch[evidMatch=="mixture" & nClow==0] <- "empty" #set empty if no data
#evidMatch[nClow==1] <- "mixture" #reset to test next block
#CONCERNING UNKNOWN PROFILES DEDUCED FROM SINGLE SOURCE PROFILES
#Assign unknowns for the rest:
isUnknown <- evidMatch=="mixture" & nClow==1 #potential unknowns
if(!is.na(isUnknown[1]) && any(isUnknown)) { #continue only if any potential unknowns
#Need to check if profiles may be same unknowns (because of NA):
#Must compare between all samples.
stringEvidSSun <- unique(stringEvid[isUnknown]) #get uniques strings
stringEvidSSun2 <- matrix(unlist(strsplit(stringEvidSSun,"-")),ncol=length(stringEvidSSun)) #get uniques strings
nS <- ncol(stringEvidSSun2) #number of unknowns
if(nS>1) { #needs at least 2 unknowns
matCC <- matrix(ncol=3,nrow=nS*(nS-1)/2) #a list of match connections
cc <- 1 #counter
#FOLLOWING PROCEDURE CAN BE VERY TIME CONSUMING FOR LARGE NUMBER OF UNKNOWNS
for(evid1 in 1:(nS-1)) { #for each unique evidence:
for(evid2 in (evid1+1):nS) { #for each unique evidence
evid1val <- stringEvidSSun2[,evid1]
evid2val <- stringEvidSSun2[,evid2]
evid1val[evid2val=="NA"] <- "NA" #allow missing genotypes in evid1
evid2val[evid1val=="NA"] <- "NA" #allow missing genotypes in evid2
nLocs <- sum(evid1val!="NA") #number of non-empty loci
match <- all(evid1val==evid2val) & nLocs>=minLoc #must match + have sufficiently number of loci
matCC[cc,] <- c(evid1,evid2,match)
cc <- cc + 1
}
}
}
#Label unknown profiles and create a data matrix (reflist not considered anymore)
unknownInd <- rep(NA,nS)
refU <- matrix(nrow=nS,ncol=length(ln)) #unknown data matrix
colnames(refU) = ln
for(evid1 in 1:nS) { #traverse through each evidence
matchevid <- numeric()
if(nS>1) {
subind <- matCC[,1]==evid1 | matCC[,2]==evid1
subCC <- matCC[subind,,drop=FALSE]
matchevid <- subCC[subCC[,3]==1,2] #find potential other matching evidence
}
unk <- length(na.omit(unique(unknownInd)))+1 #unknown tag
if(is.na(unknownInd[evid1])) {
unknownInd[evid1] <- unk
unknownInd[matchevid] <- unknownInd[evid1] #copy name to matching
#Create concensus of unknown profile:
for(loc in ln) {
locind = which(loc==ln)
ins <- stringEvidSSun2[locind,c(evid1,matchevid)]
av <- ins[ins!="NA"][1] #unlist(strsplit(ins[ins!="NA"][1],"/")) #get alleles (should be able to edit this)
if(!is.na(av[1]) && isY(loc) ) { #check if Y-marker
av2 = unlist(strsplit(av,"/"))
if(av2[1]==av2[2]) av = av2[1] #given as one allele if marker name starts if Y-marker
}
refU[unk,locind] <- av
} #end for each marker
} #end if new unknown
} #end for each evidence
nU = max(unknownInd)
unknname <- paste0("Unknown ",1:nU)
refU = refU[1:nU,,drop=FALSE] #keep only data table with inserted unknowns
rownames(refU) = unknname
#NOTE: CAN SOME refs may be IDENTICAL because loci-information is filled in?
if(any(duplicated(refU))) {
print("NOTE: Some unknown references are redundant. This is not taken into account!")
}
#Add unknowns to evidMatch:
evidUnUn <- unknname[unknownInd] #name of the unknowns belonging to SSevids
for(str in stringEvidSSun) { #for each unique string
ind <- which(stringEvid==str) #get index of equal strings
evidMatch[ind] <- evidUnUn[which(str==stringEvidSSun)] #insert unknown name
}
#Add unknowns (in refList) to refDataTABLE
dfref <- rbind(dfref,refU) #add to other references
} #End if any unknowns
return( list(mixDataTABLE=dfmix, refDataTABLE=dfref,mixDataLIST=mixL,mixDataMATCHSTATUS=evidMatch))
} #end function getStructuredData
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.