R/cohort.R
In CALIBERdatamanage: Data Management Tools for CALIBER Datasets

Documented in as.cohort cohort is.cohort merge.cohort print.cohort subset.cohort summary.cohort

# New S3 object type for cohort - one row per patient
# Alphabetical column order

cohort <- function(x, idcolname = c('patid', 'anonpatid', 'id'),
	description = NULL){
	# Designates a data.table as a 'cohort'
	# Creates a data dictionary
	# Data.tables are not copied
	if (is.cohort(x)){
		out <- x
	} else if (is.data.table(x)){
		out <- x
	} else if (is.ffdf(x)){
		out <- x
		setattr(out, 'class', c('cohort', class(out)))
	} else {
		out <- as.data.table(as.data.frame(x))
	}

	# Find ID column
	if (is.null(attr(x, 'idcolname'))){
		idcolname <- idcolname[which(idcolname %in% colnames(out))]
	} else {
		idcolname <- as.character(attr(x, 'idcolname'))
	}
	names(idcolname) <- NULL
	if (length(idcolname) == 1){
		setattr(out, 'idcolname', idcolname)
	} else if (length(idcolname) > 1){
		stop('Multiple potential ID columns, please specify')
	} else {
		stop('Invalid ID column name')
	}
	if (any(is.na(out[[idcolname]]))){
		stop('ID may not be missing')
	}
	if (length(unique(out[[idcolname]])) < nrow(out)){
		stop('ID must be unique')
	}

	if (is.null(description)){
		if (!is.null(attr(x, 'var.labels'))){
			if (length(attr(x, 'var.labels')) == ncol(x)){
				description <- data.frame(colname = colnames(x),
					description = attr(x, 'var.labels'),
					stringsAsFactors = FALSE)
			}
		}
	}

	if (is.null(description)){
		description <- attr(x, 'description')
	}

	if (is.null(description)){
		description <- data.frame(colname = setdiff(colnames(x), idcolname),
			description = '', stringsAsFactors = FALSE)
	} else {
		description <- data.frame(description)
		if (!identical(colnames(description), c('colname', 'description'))){
			stop('Description table must have two columns: colname and description')
		}
	}
	# Keep only the entries in description that are relevant to this cohort
	description <- description[description$colname %in% colnames(x), ]
	description <- description[order(description$colname), ]

	# Change the class to cohort
	if (!('cohort' %in% class(out))){
		setattr(out, 'class', c('cohort', class(out)))
	}
	
	# Set column order to be alphabetical (only works for data.tables)
	if (is.data.table(out)){
		setcolorder(out, c(idcolname, sort(setdiff(names(out), idcolname))))
		setkeyv(out, idcolname)
	}
	setattr(out, 'description', description)
	out
}

as.cohort <- function(x, ...){
	if (is.cohort(x)){
		x
	} else {
		cohort(x, ...)
	}
}

is.cohort <- function(x){
	if ('cohort' %in% class(x)){
		if (is.null(attr(x, 'idcolname'))){
			# If there is no ID column it is not a cohort
			setattr(x, 'class', setdiff(class(x), 'cohort'))
			FALSE
		} else {
			if (attr(x, 'idcolname') %in% colnames(x)){
				TRUE
			} else {
				# If the ID column is not included it is not a cohort
				setattr(x, 'class', setdiff(class(x), 'cohort'))
				FALSE
			}
		}
	} else {
		FALSE
	}
}

print.cohort <- function(x, ...){
	is_a_cohort <- is.cohort(x)
	classes <- class(x)
	# Remove cohort class designation
	data.table::setattr(x, 'class', classes[classes != 'cohort'])
	if (!is_a_cohort){
		print(x)
		# Don't restore classes if it is not a cohort
	} else {
		# Print the summary and then the cohort file itself
		summary.cohort(x)
		cat('\nDATA\n')
		if (is.data.table(x)){
			# Changing the class to data.table in order to invoke
			# print.data.table
			data.table::setattr(x, 'class', c('data.table', 'data.frame'))
			print(x)
		} else if (is.ffdf(x)){
			print(x)
		}
		# Restore original classes
		data.table::setattr(x, 'class', classes)
	}
}

summary.cohort <- function(object, ...){
	if (!(is.cohort(object))){
		# Remove cohort class designation
		setattr(object, 'class', setdiff(class(object), 'cohort'))
		summary(object)
	} else {
		# Prints a summary of a cohort
		if (is.data.table(object)){
			# Order the columns so that ID column is at the
			# front and others are alphabetical
			setcolorder(object, c(attr(object, 'idcolname'),
				sort(setdiff(names(object), attr(object, 'idcolname')))))
			cat('Data.table cohort with ') 
		} else if (is.ffdf(object)) {
			cat('FFDF cohort with ')
		}
		cat(nrow(object), 'patients and ' %&% ncol(object) %&%
			' columns; ID column =', attr(object, 'idcolname'), '\n')

		getclass <- function(colnames){
			# Get the class of columns in a data.frame, returning
			# NULL if the column does not exist
			trygetclass <- function(z){
				if (z %in% colnames(object)){
					class(object[[z]])[1]
				} else {
					'NULL'
				}
			}
			sapply(colnames, trygetclass)
		}

		description <- attr(object, 'description')
		
		if (!is.null(description)){
			if (nrow(description) > 0){
				cat('\nCOLUMN DESCRIPTIONS\n')
				# Add columns without a description
				description <- merge(description,
					data.frame(colname = colnames(object)), by = 'colname',
					all = TRUE)
				description$description[is.na(description$description)] <- ''
				
				if (is.ffdf(object)){
					object <- as.data.table(as.data.frame(object[1, ]))
				}

				cat(paste(description$colname %&% ' (' %&%
					getclass(description$colname) %&% '): ' %&%
					truncateChar(description$description,
					getOption('width') - nchar(description$colname) -
					nchar(getclass(description$colname)) - 7), collapse='\n'))
				cat('\n')
			}
		}
	}
}

truncateChar <- function(x, maxchar){
	# Truncates a character vector so that each element does not have more
	# than a specified number of characters, adding ... to the end of 
	# truncated terms
	# Arguments: x - character string to truncate
	#            maxchar - length to truncate to
	
	# Ensure maxchar is a vector of the same length as x
	maxchar <- maxchar + rep(0, length(x))

	# Split into individual lines
	for (i in 1:length(x)){
		xlines <- strsplit(x[i], '\n')[[1]]
		if (length(xlines) > 1){
			nindent <- min(nchar(strsplit(xlines[1], ':')[[1]][1]) + 2,
				getOption('width') - 20)
			xlines[2:length(xlines)] <- paste(rep(' ', nindent),
				collapse = '') %&% xlines[2:length(xlines)]
		}
		convert <- nchar(xlines) > maxchar[i]
		xlines[convert] <- substr(xlines[convert], 1, maxchar[i] - 3) %&% '...'
		x[i] <- paste(xlines, collapse = '\n')[1]
	}
	x
}

subset.cohort <- function(x, subset, select, ...){
	# S3 method for subsetting a cohort
	# Arguments: x - cohort
	#            subset - a logical expression for rows to keep,
	#                   default is to keep all rows.
	#            select - a character vector of columns to keep,
	#                   default is to keep all columns.
	#                   The ID column is always kept.
	
	# Ensure that as a minimum, code, medcode if GPRD,
	# term and category are kept. It is only valid as a codelist
	# if these columns are kept test
	if (!is.cohort(x)){
		setattr(x, 'class', setdiff(class(x), 'cohort'))		
		subset(x, subset, select)
	} else {
		if (missing(select)){
			select <- colnames(x)
		} else {
			select <- c(attr(x, 'idcolname'),
				sort(unique(setdiff(select, attr(x, 'idcolname')))))
		}
		
		if (is.data.table(x)){
			if (missing(subset)) {
				includeRow <- TRUE
			}	else {
				expr <- substitute(subset)
				includeRow <- eval(expr, x, parent.frame())
				if (!is.logical(includeRow)) 
					stop("'subset' must evaluate to a Boolean vector")
				includeRow[is.na(includeRow)] <- FALSE
			}
			out <- x[includeRow, select, with = FALSE]
		} else if (is.ffdf(x)){
			# subset.ffdf does not use ...
			out <- ffbase::subset.ffdf(x, subset)
			# If using the select argument, need to select the relevant
			# vectors by setting the others to NULL
			if (!missing(select)){
				if (is.logical(select) | is.numeric(select)){
					select <- colnames(out)[select]
				} 
				if (is.character(select) & length(select) > 0){
					# Remove unwanted columns
					remove <- setdiff(colnames(out), select)
					for (thecol in remove){
						out[[thecol]] <- NULL
					}
				}
			}
		}
		out <- cohort(out, idcolname = attr(x, 'idcolname'),
			description = attr(x, 'description')) 
		out <- purgeDescription(out)
		out
	}
}


merge.cohort <- function(x, y, by = attr(x, 'idcolname'), ...){
	# Merges two cohorts, warning if there are common columns
	# The ID column name must be identical
	# Combines the descriptions
	
	if (!is.cohort(x)){
		setattr(x, 'class', setdiff(class(x), 'cohort'))
		merge(x, y, by, ...)
	} else {
		
		if (is.cohort(x) & is.cohort(y)){
			if (attr(x, 'idcolname') != attr(y, 'idcolname')){
				stop('ID column names in x and y are different')
			}
		}
		
		# If one of the objects is a data.table, the other is converted
		# t oa data.table before merging
		if (is.data.table(x) & !is.data.table(y)){
			y <- as.data.table(y)
		}
		if (is.data.table(y) & !is.data.table(x)){
			x <- as.data.table(x)
		}

		if (length(intersect(colnames(x), colnames(y))) > 1){
			warning('Columns ' %&%
				paste(intersect(colnames(x), colnames(y)), collapse=', ') %&%
				' are in both cohort datasets')
		}

		classx <- class(x)
		classy <- class(y)

		# Simplify the classes to select the correct merge method
		if (is.ffdf(x) & is.ffdf(y)){
			setattr(x, 'class', 'ffdf')
			setattr(y, 'class', 'ffdf')
		}
		if (is.data.table(x) & is.data.table(y)) {
			setattr(x, 'class', c('data.table', 'data.frame'))
			setattr(y, 'class', c('data.table', 'data.frame'))
		}
		
		# Perform the merge
		out <- merge(x, y, by = by, ...)
		
		# Restore original classes
		setattr(x, 'class', classx)
		setattr(y, 'class', classy)
		
		out <- cohort(out, idcolname = attr(x, 'idcolname'),
			description <- rbind(attr(x, 'description'), attr(y, 'description')))
		out
	}
}

Any scripts or data that you put into this service are public.

CALIBERdatamanage documentation built on Nov. 23, 2021, 3 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

CALIBERdatamanage
Data Management Tools for CALIBER Datasets

R/cohort.R
In CALIBERdatamanage: Data Management Tools for CALIBER Datasets

Defines functions merge.cohort subset.cohort truncateChar summary.cohort print.cohort is.cohort as.cohort cohort

Documented in as.cohort cohort is.cohort merge.cohort print.cohort subset.cohort summary.cohort

Try the CALIBERdatamanage package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CALIBERdatamanage Data Management Tools for CALIBER Datasets

R/cohort.R In CALIBERdatamanage: Data Management Tools for CALIBER Datasets

Defines functions merge.cohort subset.cohort truncateChar summary.cohort print.cohort is.cohort as.cohort cohort

Documented in as.cohort cohort is.cohort merge.cohort print.cohort subset.cohort summary.cohort

Try the CALIBERdatamanage package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CALIBERdatamanage
Data Management Tools for CALIBER Datasets

R/cohort.R
In CALIBERdatamanage: Data Management Tools for CALIBER Datasets