R/longskat-file.r

get_large_file_lines<-function(file.large)
{
	testcon <- file( file.large,open="r");
	readsizeof <- 200000
	nooflines <- 0
	( while((linesread <- length(readLines(testcon,readsizeof))) > 0 ) 
	nooflines <- nooflines+linesread )
	close(testcon)
	nooflines
}


check_plink_file<-function( file.plink.bed, file.plink.bim, file.plink.fam )
{
	cat("Checking PLINK file......\n");
	cat("* BED file =", file.plink.bed, "\n");
	cat("* BIM file =", file.plink.bim, "\n");
	cat("* FAM file =", file.plink.fam, "\n");

	#snp.mat <- try( read.plink( file.plink.bed,  file.plink.bim, file.plink.fam) );
	#if(class(snp.mat)=="try-error")
	#{
	#	return(list(bSuccess=F));
	#}
	#n.idv <- NROW(snp.mat$fam)
	#n.snp <- NCOL(snp.mat$genotypes)

	tb.fam <- read.table(file.plink.fam, header=F);
	n.idv <- NROW(tb.fam);
	n.snp <- get_large_file_lines(file.plink.bim);

	cat("* Individuals =", n.idv, "SNPs=", n.snp, "\n")
	cat("* PLINK loading successfully.\n")

	return(list(bSuccess=T));
}

# Phenotype File( CSV Format, Header=T  Comma seprated )
#
# Format 1:
# shareid, PHE1, PHE2, ...
#
# Format 3:
# shareid, TIME1, TIME2, ...

check_pheno_file<-function( file.phe.long, file.phe.time, file.plink.fam )
{
	cat("Checking phenotype file......\n");
	cat("* PHE.LONG =", file.phe.long , "\n");

	phe.long <- try( read.csv(file.phe.long, header=T, stringsAsFactors=F, row.names=1) );
	if (class(phe.long)=="try-error")
	{
		cat("! Can not open file(", file.phe.long, ")\n");
		return(list(bSuccess=F));
	}
	else{
		cat("* Individuals =", NROW(phe.long), "\n");
		cat("* Times =", NCOL(phe.long), "\n");
		cat("* Mean =",  colMeans(phe.long, na.rm=T), "\n");
		cat("* SD =",    colSds(phe.long, na.rm=T),"\n");

		cat("  First 5 Items:\n");
		show(head(phe.long, n=5));
	}

	cat("* PHE.TIME =", file.phe.time , "\n");
	phe.time <- NULL;
	if(!is.null(file.phe.time))
	{
		phe.time <- try( read.csv(file.phe.time, header=T, stringsAsFactors=F, row.names=1) );
		if (class(phe.time)=="try-error")
		{
			cat("! Can not open file(", file.phe.time, ")\n");
			return(list(bSuccess=F));
		}
		else
		{
			cat("* Individuals =", NROW(phe.time), "\n");
			cat("* Times =", NCOL(phe.time), "\n");
			cat("* Mean =",  colMeans(phe.time, na.rm=T), "\n");
			cat("* SD =",    colSds(phe.time, na.rm=T),"\n");
			cat("  First 5 Items:\n");
			show(head(phe.time, n=5));
		}

		idx.inter <- intersect( rownames(phe.long), rownames(phe.time) );
		if( !( length(idx.inter)==NROW(phe.long) && length(idx.inter)==NROW(phe.time) ) )
		{
			cat("! PHE.LONG doesn't have consistent IDs with PHE.TIME.\n" );
			return(list(bSuccess=F));
		}
	}

	cat("* PLINK.FAM =", file.plink.fam , "\n");
	phe.fam <- try( read.table(file.plink.fam, header=F, stringsAsFactors=F) );
	if (class(phe.fam)=="try-error")
	{
		cat("! Can not open file(", file.plink.fam, ")\n");
		return(list(bSuccess=F));
	}
	else{
		cat("  First 5 Items:\n");
		show(head(phe.fam, n=5));
	}

	m.phe  <- match( rownames(phe.long), as.character(phe.fam$V2)  );
	if (length(which(is.na(m.phe))) > 0 )
	{
		cat("!", length(which(is.na(m.phe))), "IDs can not be found in the PLINK file.\n" );
		cat("! First 5 IDs:\n");
		show( head(rownames(phe.long[ which(is.na(m.phe)), ]), n=5) );
	}

	m.snp <- match( as.character( phe.fam$V2) , rownames(phe.long) );
	if (length(which(is.na(m.snp))) > 0 )
	{
		cat("!", length(which(is.na(m.snp))), " IDs can not be found in the phenotype file.\n" );
		cat("! First 5 IDs:\n");
		show( head( phe.fam$V2[ which(is.na(m.snp))], n=5) );
	}

	all.na <- which( is.na( rowSums(phe.long, na.rm=T)	) )
	if (length(all.na)>0 )
	{
		cat("!", length(all.na), "IDs dont have non-missing data.\n" );
		cat("! First 5 IDs:\n");
		show( head( phe.long[all.na, ], n=5) );
	}

	return(list(bSuccess=T))
}

# Covariate File( CSV Format, Header=T  Comma seprated  )
# Format:
#
# shareid, COV1,....

check_covariate_file<-function( file.phe.cov, file.plink.fam  )
{
	cat("Checking covariate file......\n");

	cat("* COV.FILE =", file.phe.cov , "\n");
	phe.cov <- try( read.csv(file.phe.cov, header=T, stringsAsFactors=F, row.names=1) );
	if (class(phe.cov)=="try-error")
	{
		cat("! Can not open file(", file.phe.cov, ")\n");
		return(list(bSuccess=F));
	}
	else{
		cat("* Individuals =", NROW(phe.cov), "\n");
		cat("* Covariate =", NCOL(phe.cov), "\n");
		cat("* Mean =",  colMeans(phe.cov, na.rm=T), "\n");
		cat("* SD =",    colSds(phe.cov, na.rm=T), "\n");
		cat("  First 5 Items:\n");
		show(head(phe.cov, n=5));
	}

	y.ncov <- NCOL(phe.cov) - 1;
	if( NCOL(phe.cov) < 1 + y.ncov)
	{
		cat("! Insufficient covariates in the covariate file, ", NCOL(phe.cov), "<", 1 + y.ncov, ".\n" );
		return(list(bSuccess=F));
	}

	phe.fam <- try( read.table(file.plink.fam, header=F, stringsAsFactors=F) );
	if (class(phe.fam)=="try-error")
	{
		cat("! Can not open file(", file.plink.fam, ")\n");
		return(list(bSuccess=F));
	}

	m.phe  <- match( rownames(phe.cov), as.character(phe.fam$V2)  );
	if (length(which(is.na(m.phe))) > 0 )
	{
		cat("!", length(which(is.na(m.phe))), "IDs, can not be found in the PLINK file.\n" );
		cat("! First 5 IDs:\n");
		show( head(phe.cov[is.na(m.phe), ], n=5 ) );
	}

	m.snp <- match( as.character(phe.fam$V2) , rownames(phe.cov) );
	if (length(which(is.na(m.snp))) > 0 )
	{
		cat("!", length(which(is.na(m.snp))), "IDs, can not be found in the phenotype file.\n" );
		cat("! First 5 IDs:\n");
		show( head(phe.fam[is.na(m.snp), ], n=5 ) );
	}

	all.na <- which( is.na( rowSums(phe.cov, na.rm=T)	) )
	if (length(all.na)>0 )
	{
		cat("!", length(all.na), "IDs dont have non-missing data.\n" );
		cat("! First 5 IDs:\n");
		show( head( phe.cov[all.na, ], n=5) );
	}

	return(list(bSuccess=T))
}

# GENE SET File( Header=F, 2 columns, Space seprated )
# Format:
#
# snpname, gene.name

check_geneset_file<-function( file.gene.set )
{
	cat("Checking gene definition file......\n");
	cat("* GEN.SET.FILE =", file.gene.set , "\n");

	tb <- try( read.table(file.gene.set, sep=" ", header=F, stringsAsFactors=F) );
	if(class(tb)=="try-error")
		return(list(bSuccess=F));

	genes <- unique(tb[,1]);

	cat("* GENEs =", length(genes), "\n");
	cat("* SNPs =", NROW(tb), "\n");

	return(list(bSuccess=T))
}


read_phe_cov<-function( file.phe.long, file.phe.time, file.phe.cov, PF.gen)
{
	phe.long <- read.csv(file.phe.long, header=T, stringsAsFactors=F, row.names=1);
	cat("  PHE LONG =", file.phe.long, "\n");
	cat("* Individuals =", NROW(phe.long), "\n");
	cat("* Times =", NCOL(phe.long), "\n");
	cat("* Mean =",  colMeans(phe.long, na.rm=T), "\n");
	cat("* SD =",    colSds(phe.long, na.rm=T),"\n");
	idx.na <- which( rowSums(is.na(phe.long)) == NCOL(phe.long) );
	if( length(idx.na)>0) phe.long <- phe.long[ -idx.na, ,drop=F];

	phe.time <- NULL;
	if (!is.null(file.phe.time))
	{
		phe.time <- read.csv(file.phe.time, header=T, stringsAsFactors=F, row.names=1);
		cat("  PHE TIME =", file.phe.time, "\n");
		cat("* Individuals =", NROW(phe.time), "\n");
		cat("* Times =", NCOL(phe.time), "\n");
		cat("* Mean =",  colMeans(phe.time, na.rm=T), "\n");
		cat("* SD =",    colSds(phe.time, na.rm=T),"\n");
		idx.na <- which( rowSums( is.na(phe.time))==NCOL(phe.time) );
		if( length(idx.na)>0) phe.time <- phe.time[ -idx.na, ];
	}

	phe.cov <- read.csv(file.phe.cov, header=T, stringsAsFactors=F, row.names=1);
	cat("  PHE COV =", file.phe.cov, "\n");
	cat("* Individuals =", NROW(phe.cov), "\n");
	cat("* Covariate =", NCOL(phe.cov), "\n");
	cat("* Mean =",  colMeans(phe.cov, na.rm=T), "\n");
	cat("* SD =",    colSds(phe.cov, na.rm=T), "\n");

	ids.phe <- intersect(rownames(phe.long), rownames(phe.cov) );
	if(!is.null(phe.time))
		ids.phe <- intersect(ids.phe, rownames(phe.time) );

	ids.set <- intersect(ids.phe, get_gen_individuals(PF.gen) );
	cat("  COMMON Individuals=", length(ids.set), "\n");

	#eg. c(10:1)[match(c(4, 6,8,2,3), c(10:1))]
	idx.long <- match( ids.set, rownames(phe.long) );
	phe.long <- phe.long[idx.long, ,drop=F];

	idx.cov <- match( ids.set, rownames(phe.cov) );
	phe.cov <- phe.cov[idx.cov, , drop=F];

	if(!is.null(phe.time))
	{
		idx.time <- match( ids.set, rownames(phe.time) );
		phe.time <- phe.time[idx.time, ,drop=F];
	}

	sync_gen_individuals(PF.gen, ids.set)

	if( !is.null(phe.time) && !all( rownames(phe.long) == rownames(phe.time) ) )
		stop("! ID MATCH ERROR between PHE.LONG and PHE.TIME. \n");

	if (!( all( rownames(phe.long)==rownames(phe.cov)) && all( rownames(phe.long)== get_gen_individuals(PF.gen) ) ) )
		stop("! ID MATCH ERROR among 3 files( PHE.LONG, PHE.COV, PLINK.FAM). \n");

	return(list( phe.long=phe.long, phe.time=phe.time, phe.cov = phe.cov));
}
ZWang-Lab/LSKAT documentation built on May 10, 2019, 1:55 a.m.