R/filteringGenes.R
In qfa: Tools for Quantitative Fitness Analysis (QFA) of Arrayed Microbial Cultures Growing on Solid Agar Surfaces

Documented in checkTarg getMissingGenotypes getMissingSGA getNeighbours readMer readSGD

readSGD=function(fname="http://downloads.yeastgenome.org/curation/chromosomal_feature/SGD_features.tab"){
	# Read in SGD features tab file and give columns appropriate headers
	sgdNames=c("SGDID1","FType","FQual","FName","Gene","Alias","Parent","SGDID2","Chr","Start","Stop","Strand","Pos","CoordVersion","SeqVersion","Description")
	# Note I had to strip a stray " from line 5542 to get whole file to read in, or read with quote=""
	sgd=read.delim(fname,header=FALSE,stringsAsFactors=FALSE,col.names=sgdNames,sep="\t",quote="")
	sgd$Mid=(sgd$Start+sgd$Stop)/2
	return(sgd)
}

checkTarg=function(targ,sgd){
	targ=toupper(targ)
	# If targ is not a y-number, try standard gene name or alias
	if(!targ%in%sgd$FName){
		tfinal="MISSING"
		if(targ%in%sgd$Gene){
			tfinal=sgd$FName[sgd$Gene==targ]
		}
		if(targ%in%sgd$Alias){
			tfinal=sgd$FName[sgd$Alias==targ]
		}
		return(tfinal)
	}else{
		return(targ)
	}
}

getNeighbours=function(gvec,kb,sgd,geneCoord="Mid",geneDist="Mid",geneDir="All"){
	# Can specify geneCoord = "Start", "Mid" or "Stop" for the definition of the position of target genes
	# Can specity geneDist = "Start", "Stop", "Mid" or "Min" (smallest of all alternatives) for definition of distance between genes
	# Can specify geneDir = "All", "Up" or "Down"
	geneDist=paste("d",geneDist,sep="")
	gvec=sapply(gvec,checkTarg,sgd)
	res=head(sgd,0)
	res=data.frame()
	
	for(g in gvec){
		targ=sgd[sgd$FName==g,]
		sgdChr=sgd[(sgd$Chr==targ$Chr),]
		sgdChr$dStart=sgdChr$Start-targ[[geneCoord]]
		sgdChr$dStop=sgdChr$Stop-targ[[geneCoord]]
		sgdChr$dMid=sgdChr$Mid-targ[[geneCoord]]
		sgdChr$dMin=sign(sgdChr$dMid)*pmin(abs(sgdChr$dStart),abs(sgdChr$dStop))
		sgdChr$All=abs(sgdChr[[geneDist]])
		sgdChr$Up=if(targ$Stop>targ$Start){sgdChr[[geneDist]]}else{-sgdChr[[geneDist]]}
		sgdChr$Down=if(targ$Stop>targ$Start){-sgdChr[[geneDist]]}else{sgdChr[[geneDist]]}
		near=sgdChr[(sgdChr[[geneDir]]>=0)&(sgdChr[[geneDir]]<kb*1000)&(sgdChr$FType=="ORF"),]
		near$Target_ORF=g
		near$Target_Gene=targ$Gene
		res=rbind(res,near)
	}
	return(res)
}

## DEMO Stripping genes found nearby other genes
# Parse SGD features.tab
# Find chromosome
# Find all genes on chromosome within cutoff kb of target
# Return Y-numbers and gene names
#sgd=readSGD("CommonAuxiliary/SGD_features.tab")
#neighbs=getNeighbours(c("cdc13","lyp1","yku70","ura3"),5,sgd)
#print(neighbs$Gene)
#print(neighbs$FName)

readMer=function(fname="CommonAuxiliary/QFA_Database.mer"){
	mer=read.delim(fname,sep=",",stringsAsFactors=FALSE)
	# Need to tidy up SGA Number column
	mer$SGA.Number=as.numeric(gsub("SGA","",mer$SGA.Number))
	return(mer)
}

getMissingSGA=function(SGAs,threshfrac){
	SGAs=sprintf("SGA%04d",SGAs)
	findFirst=function(SGA,threshfrac){
		# Want YPD_G fitness report, but can't rely on client or temperature (also in file name)
		SGAfiles=list.files(file.path("SGA_EXPERIMENTS",SGA,"ANALYSISOUT"))
		# Want only fitness reports
		SGAfiles=SGAfiles[grepl("FitnessReport",SGAfiles)]
		if(length(SGAfiles)>1){
			# Sometimes YPD_G/YPD_GN, sort and choose lowest
			SGAfile=sort(SGAfiles[grepl("YPD_G",SGAfiles)])[1]
		}else{
			SGAfile=SGAfiles
		}
		print(paste("Identifying dead strains in",SGAfile))
		SGApath=file.path("SGA_EXPERIMENTS",SGA,"ANALYSISOUT",SGAfile)
		f=file(SGApath, "r", blocking = FALSE)
		flines=readLines(f)
		close(f)
		delimiter=which(grepl("#######",flines))[1]
		SGAfit=read.delim(SGApath,sep="\t",stringsAsFactors=FALSE,skip=delimiter)
		SGAfit$MedianScaled=SGAfit$MedianFit/median(SGAfit$MedianFit)
		SGAfit$MeanScaled=SGAfit$MeanFit/mean(SGAfit$MedianFit)
		SGAfit$SGA=SGA
		return(SGAfit[SGAfit$MedianScaled<=threshfrac,])
	}
	reptList=lapply(SGAs,findFirst,threshfrac)
	combined=do.call(rbind,reptList)
	res=combined[!duplicated(combined$ORF),]
	return(res)
}

getMissingGenotypes=function(QFAs,threshfrac,mer){
	QFAnums=as.numeric(substr(QFAs,nchar(QFAs)-3,nchar(QFAs)))
	SGAnums=mer$SGA.Number[mer$Screen.Number%in%QFAnums]
	SGAnums=SGAnums[!is.na(SGAnums)]
	if(length(SGAnums)==0) {
		print("Warning!  No SGAs found in database when searching for dead cultures")
		return(NULL)
	}
	print(paste("Identifying cultures missing in",sprintf("SGA%04d",SGAnums)))
	res=getMissingSGA(SGAnums,threshfrac)
	return(res)
}

## DEMO Stripping genes that were observed to be dead in starting libraries
# Parse .mer file
# Find SGA screen corresponding to QFA
# Find strains whose median fitness lie below a threshold (what about genotypes with many replicate strains?)
# Alternatively find pinIDs whose median fitnesses lie below threshold...
#mer=readMer("CommonAuxiliary/QFA_Database.mer")
#dead=getMissingGenotypes(c(136,60),0.25,mer)
#print(dead$ORF)
#print(dead$Gene)

Any scripts or data that you put into this service are public.

qfa documentation built on Feb. 22, 2020, 3:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

qfa
Tools for Quantitative Fitness Analysis (QFA) of Arrayed Microbial Cultures Growing on Solid Agar Surfaces

R/filteringGenes.R
In qfa: Tools for Quantitative Fitness Analysis (QFA) of Arrayed Microbial Cultures Growing on Solid Agar Surfaces

Defines functions readSGD checkTarg getNeighbours readMer getMissingSGA getMissingGenotypes

Documented in checkTarg getMissingGenotypes getMissingSGA getNeighbours readMer readSGD

Try the qfa package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

qfa Tools for Quantitative Fitness Analysis (QFA) of Arrayed Microbial Cultures Growing on Solid Agar Surfaces

R/filteringGenes.R In qfa: Tools for Quantitative Fitness Analysis (QFA) of Arrayed Microbial Cultures Growing on Solid Agar Surfaces

Defines functions readSGD checkTarg getNeighbours readMer getMissingSGA getMissingGenotypes

Documented in checkTarg getMissingGenotypes getMissingSGA getNeighbours readMer readSGD

Try the qfa package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

qfa
Tools for Quantitative Fitness Analysis (QFA) of Arrayed Microbial Cultures Growing on Solid Agar Surfaces

R/filteringGenes.R
In qfa: Tools for Quantitative Fitness Analysis (QFA) of Arrayed Microbial Cultures Growing on Solid Agar Surfaces