#' Import new data into DoOR
#'
#' Import or update new data and update
#' \code{door_global_normalization_weights}, \code{door_response_range},
#' \code{odor}, \code{ORs} and receptor data frames.
#'
#' \code{\link{import_new_data}} is used to import new data into database. If
#' the data contains a new receptor or ORN, then build a new data frame for this
#' receptor or ORN. If the data contains a receptor that is already present in
#' database, then merge the imported data into old data frame. The information
#' (e.g. response range, how many receptors and odors were measured from given
#' study) will be integrated into data \code{door_response_range}, \code{odor},
#' \code{ORs} and \code{door_global_normalization_weights}. If an existing study
#' is imported, \code{\link{remove_study}} will be run first in order to perform
#' an update.
#'
#' @param file.name character string, the name of given file that contains
#' response values of one or more odorant receptors, either a .csv or .txt
#' file.
#' @param dataFormat data frame, a data frame does not contain any response
#' value but odorant information.
#' @param odor_data data frame, contains the odorant information.
#' @param weightGlobNorm data matrix, indicates whether given receptor has been
#' measured by given study.
#' @param responseRange data frame, contains the information about response
#' range of each study and how many odors have been measured in each study.
#' @param receptors data frame, contains the receptor and OSN names and their
#' expression.
#' @param ident the identifier used for matching, usually the InChIKey is used.
#' @param round the number of digits the imported values are rounded to.
#' @author Shouwen Ma <\email{shouwen.ma@@uni-konstanz.de}>
#' @author Daniel Münch <\email{daniel.muench@@uni-konstanz.de}>
#' @aliases importNewData import_new_data
#' @export
#' @importFrom stats na.omit
#' @importFrom utils read.csv read.table
#' @keywords data
#' @examples
#' \dontrun{
#' import new data named "odorantResponses_Orx.txt" into database and update the
#' support data.
#' library(DoOR.data)
#' import_new_data(file.name = "odorantResponses_Orx.csv")
#' }
#'
import_new_data <- function(file.name,
dataFormat = door_default_values("door_data_format"),
odor_data = door_default_values("odor"),
weightGlobNorm = door_default_values("door_global_normalization_weights"),
responseRange = door_default_values("door_response_range"),
receptors = door_default_values("ORs"),
ident = door_default_values("ident"),
round = 3) {
if (any(grep(".txt$", file.name))) {
imported.data <- read.table(file.name)
file.name <- gsub(".txt$", "", file.name)
}
if (any(grep(".csv$", file.name))) {
imported.data <- read.csv(file.name)
file.name <- gsub(".csv$", "", file.name)
}
if (!("Name" %in% colnames(imported.data)) |
!("InChIKey" %in% colnames(imported.data)))
stop("'InChIKey' or 'Name' column missing, these two are mandatory!")
# check for already existing studies
if (any(file.name %in% responseRange$study)) {
existing <- which(file.name %in% responseRange$study)
existing <- file.name[existing]
for (i in existing)
remove_study(i)
# get updated versions from .globalEnv
responseRange <- door_response_range
weightGlobNorm <- door_global_normalization_weights
warning(
paste(
'The following studies were already existing and were removed
prior to integrating the new data:\n',
paste(existing, collapse = ', ')
)
)
}
# check for missing identifiers
if (any(is.na(imported.data[, ident]))) {
nas <- which(is.na(imported.data[, ident]))
imported.data <- droplevels(imported.data[-nas, ])
warning('We found ',
length(nas),
' missing identifier(s),
this data was removed!',
call. = FALSE)
}
# check for duplicated identifiers
if (any(duplicated(tolower(imported.data[, ident]))))
stop('There are duplicated identifiers in the new dataset,
please solve this first.')
# convert CID to character
whichCIDCol <- grep("CID", names(imported.data))
if (!is.na(whichCIDCol[1])) {
imported.data[, whichCIDCol] <- factor(imported.data[, whichCIDCol])
}
# look for columns containing numerical values (response data)
nv <- as.numeric(which(vapply(imported.data, is.numeric, logical(1))))
n <- length(nv)
receptor_file <- colnames(imported.data)[nv]
if (round != FALSE) {
imported.data[nv] <- round(imported.data[nv], round)
}
# update data matrix "door_global_normalization_weights"
dim_weightGlobNorm <- dim(weightGlobNorm)
weightGlobNorm[, dim_weightGlobNorm[2] + 1] <- NA
colnames(weightGlobNorm)[dim_weightGlobNorm[2] + 1] <- file.name
matchreceptor <- match(receptor_file, rownames(weightGlobNorm))
stats <-
list(
'importedReceptors' = length(receptor_file),
'newReceptors' = 0,
'updatedOdors' = 0,
'newOdors' = 0
)
if (any(is.na(matchreceptor))) {
whichNotmatch <- which(is.na(matchreceptor))
newReceptor <- receptor_file[whichNotmatch]
dim_weightGlobNorm <- dim(weightGlobNorm)
lastRow <- (dim_weightGlobNorm[1] + length(whichNotmatch))
seqlastRows <- ((dim_weightGlobNorm[1] + 1):lastRow)
weightGlobNorm[seqlastRows, dim_weightGlobNorm[2]] <- NA
rownames(weightGlobNorm)[seqlastRows] <- newReceptor
message(
paste(
newReceptor,
"has been added into
'door_global_normalization_weights'.",
collapse = '\n'
)
)
}
# update data frame "response range"
responseRange_new.file <- range(imported.data[, nv], na.rm = TRUE)
responseRange_new <- data.frame(
study = file.name,
min = responseRange_new.file[1],
max = responseRange_new.file[2],
n_odors = sum(apply(!is.na(imported.data[nv]), 1, sum) > 0)
)
# dim(imported.data)[1]) # changed as the old way also returned NAs
responseRange <- rbind(responseRange, responseRange_new)
# update data frame "odor" and door_data_format if new odor is available.
matchtoOdor <- match(imported.data[, ident], odor_data[, ident])
whichNA <- which(is.na(matchtoOdor))
stats$updatedOdors <- length(na.omit(matchtoOdor))
stats$newOdors <- length(whichNA)
if (!identical(odor_data[1:5], dataFormat))
stop(
"The odorant lists of data 'odor' and 'door_data_format' are not
identical. Please check them again."
)
if (is.na(whichNA[1])) {
message("There were no new odors imported.")
} else {
##########
# add new odor identifiers to 'odor' and 'door_data_format' ----------------
##########
newOdor <- as.character(imported.data[whichNA, "Name"])
message(
paste(
newOdor,
" - is a new odor. Data frames 'odor' and
'door_data_format' will be updated.",
collapse = '\n'
)
)
dim_odor <- dim(odor_data)
odor_data[(dim_odor[1] + length(whichNA)), ] <- NA
levels(odor_data$InChIKey) <- union(levels(odor_data$InChIKey),
levels(imported.data$InChIKey))
odor_data[(dim_odor[1] + (seq_along(whichNA))), "InChIKey"] <-
as.character(imported.data[whichNA, "InChIKey"])
levels(odor_data$Name) <-
union(levels(odor_data$Name), levels(imported.data$Name))
odor_data[(dim_odor[1] + (seq_along(whichNA))), "Name"] <-
as.character(imported.data[whichNA, "Name"])
if ('CAS' %in% colnames(imported.data)) {
levels(odor_data$CAS) <-
union(levels(odor_data$CAS), levels(imported.data$CAS))
odor_data[(dim_odor[1] + (seq_along(whichNA))), "CAS"] <-
as.character(imported.data[whichNA, "CAS"])
}
if ('CID' %in% colnames(imported.data)) {
levels(odor_data$CID) <-
union(levels(odor_data$CID), levels(imported.data$CID))
odor_data[(dim_odor[1] + (seq_along(whichNA))), "CID"] <-
as.character(imported.data[whichNA, "CID"])
}
if ('Class' %in% colnames(imported.data)) {
levels(odor_data$Class) <-
union(levels(odor_data$Class), levels(imported.data$Class))
odor_data[(dim_odor[1] + (seq_along(whichNA))), "Class"] <-
as.character(imported.data[whichNA, "Class"])
}
if ('InChI' %in% colnames(imported.data)) {
levels(odor_data$InChI) <-
union(levels(odor_data$InChI), levels(imported.data$InChI))
odor_data[(dim_odor[1] + (seq_along(whichNA))), "InChI"] <-
as.character(imported.data[whichNA, "InChI"])
}
if ('SMILES' %in% colnames(imported.data)) {
levels(odor_data$SMILES) <-
union(levels(odor_data$SMILES),
levels(imported.data$SMILES))
odor_data[(dim_odor[1] + (seq_along(whichNA))), "SMILES"] <-
as.character(imported.data[whichNA, "SMILES"])
}
dataFormat <- odor_data[1:5]
message(
"If data was provided, 'CAS', 'NAME', 'CID', 'Class',
'InChI' and 'SMILES' columns were updated."
)
}
# if there is a new receptor or ORN update data frame "ORs", NOTE:
# the AL projection pattern (door_mappings) should be added manually
match_receptor <- match(receptor_file, receptors[, "OR"])
what_is_new <- receptor_file[which(is.na(match_receptor))]
if (!is.na(what_is_new[1])) {
ORs_new <- data.frame(OR = what_is_new, expression = NA)
receptors <- rbind(receptors, ORs_new)
message("New receptor or ORN has been added to 'ORs',
please input the AL mapping manually.")
}
##########
# add new response data
##########
for (j in receptors[, "OR"]) {
target <-
try(get(j), silent = TRUE)
# try to get the data and assign it a names "target"
if (inherits(target, "try-error")) {
# if it can be done, there must be a new receptor
target <- dataFormat
message(paste(
j,
"is a new receptor or ORN. A new response data.frame was created."
))
}
matchOdor <- match(imported.data[, ident], target[, ident])
whichNA <- which(is.na(matchOdor))
if (is.na(whichNA[1])) {
assign(j, target, envir = .GlobalEnv)
} else {
dim_RD <- dim(target)
target[(dim_RD[1] + length(whichNA)), ] <- NA
levels(target$InChIKey) <-
union(levels(target$InChIKey),
levels(imported.data$InChIKey))
target[(dim_RD[1] + (seq_along(whichNA))), "InChIKey"] <-
as.character(imported.data[whichNA, "InChIKey"])
levels(target$Name) <-
union(levels(target$Name), levels(imported.data$Name))
target[(dim_odor[1] + (seq_along(whichNA))), "Name"] <-
as.character(imported.data[whichNA, "Name"])
if ('Class' %in% colnames(imported.data)) {
levels(target$Class) <-
union(levels(target$Class), levels(imported.data$Class))
target[(dim_RD[1] + (seq_along(whichNA))), "Class"] <-
as.character(imported.data[whichNA, "Class"])
}
if ('CAS' %in% colnames(imported.data)) {
levels(target$CAS) <-
union(levels(target$CAS), levels(imported.data$CAS))
target[(dim_RD[1] + (seq_along(whichNA))), "CAS"] <-
as.character(imported.data[whichNA, "CAS"])
}
if ('CID' %in% colnames(imported.data)) {
levels(target$CID) <-
union(levels(target$CID), levels(imported.data$CID))
target[(dim_odor[1] + (seq_along(whichNA))), "CID"] <-
as.character(imported.data[whichNA, "CID"])
}
# assign the target back to his real name
assign(j, target, envir = .GlobalEnv)
}
}
# import data
for (i in 1:n) {
column.name <- receptor_file[i] # receptor name
target <-
try(get(column.name), silent = TRUE)
# try to find a match receptor and load data from old database
# check if receptor is new
if (dim(target)[2] == 5)
stats$newReceptors <- stats$newReceptors + 1
assign(
column.name,
combine_data(
data1 = target,
data2 = imported.data,
by.data2 = column.name,
assigned.name = paste(file.name, sep = "")
),
envir = .GlobalEnv
)
message(paste(column.name, "has been imported."))
# update door_global_normalization_weights
dim_weightGlobNorm <- dim(weightGlobNorm)
match_receptor <-
match(column.name, rownames(weightGlobNorm))
weightGlobNorm[match_receptor, dim_weightGlobNorm[2]] <- 1
} # END for (i in 1:n)
# return the updates back to working enviroment
assign("door_data_format", dataFormat, envir = .GlobalEnv)
assign("door_global_normalization_weights",
weightGlobNorm,
envir = .GlobalEnv)
assign("door_response_range", responseRange, envir = .GlobalEnv)
assign("ORs", receptors, envir = .GlobalEnv)
assign("odor", odor_data, envir = .GlobalEnv)
message()
message(
paste(
'###################\n',
'Import statistics:\n',
stats$importedReceptors - stats$newReceptors,
'response profiles were updated,',
stats$newReceptors,
'new profiles were added to DoOR.\n',
stats$updatedOdors,
'odorants were updated,',
stats$newOdors,
'new odorants were added to DoOR.'
)
)
} # END program "import_new_data"
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.