#' report_as_dataframe
#'
#' Function to transform a list of NCBI Virus Report metadata into a table.
#'
#' Each selected record becomes one row. A field that is missing from a record
#' (or is empty) is stored as NA. A field holding several values (for example
#' more than one BioProject accession) is collapsed into a single string with
#' the values separated by "; ", so every record fills exactly one cell per
#' column.
#'
#' @usage report_as_dataframe(report, records = seq_along(report))
#'
#' @param report a list of records derived from a virus data report from NCBI
#'   Datasets, e.g. every line of a data_report.jsonl file parsed into a list.
#'
#' @param records a vector of indices to pull from the report. Defaults to all
#'   records.
#'
#' @returns A data frame with 24 variables containing metadata from the NCBI
#'   Virus report, one row per selected record. geneCount, length and
#'   maturePeptideCount are numeric; completeness, isolate_source and
#'   nucleotideCompleteness are factors; isolate_collectionDate, releaseDate
#'   and updateDate are Dates; every other column is character.
#'
#' @examples
#' \dontrun{
#' # Read in .jsonl of data report
#' filepath <- "../covid19_data/data_report.jsonl"
#' report <- readLines(filepath)
#' report <- lapply(report, rjson::fromJSON)
#' report_df <- report_as_dataframe(report)
#' }
#'
#' @export report_as_dataframe
# 'records' takes a vector of indices to parse from report
report_as_dataframe <- function(report, records = seq_along(report)) {
  # Output column name -> path of keys to follow inside one record.
  # Description of the schema can be found here:
  # https://www.ncbi.nlm.nih.gov/datasets/docs/reference-docs/data-reports/virus/#ncbi-datasets-v1alpha1-reports-VirusAssembly
  field_paths <- list(
    accession = "accession",
    completeness = "completeness",
    geneCount = "geneCount",
    isAnnotated = "isAnnotated",
    isolate_collectionDate = c("isolate", "collectionDate"),
    isolate_name = c("isolate", "name"),
    isolate_source = c("isolate", "source"),
    length = "length",
    bioProjects = "bioprojects",
    geo_Location = c("location", "geographicLocation"),
    geo_Region = c("location", "geographicRegion"),
    maturePeptideCount = "maturePeptideCount",
    molType = "molType",
    nucleotide_accessionVersion = c("nucleotide", "accessionVersion"),
    nucleotide_seqID = c("nucleotide", "seqId"),
    nucleotide_sequenceHash = c("nucleotide", "sequenceHash"),
    nucleotide_title = c("nucleotide", "title"),
    nucleotideCompleteness = "nucleotideCompleteness",
    proteinCount = "proteinCount",
    releaseDate = "releaseDate",
    sourceDatabase = "sourceDatabase",
    updateDate = "updateDate",
    virus_sciName = c("virus", "sciName"),
    virus_taxID = c("virus", "taxId")
  )

  # Follow 'path' through nested lists. `[[` matches names exactly, unlike `$`,
  # so a missing "nucleotide" entry is never confused with
  # "nucleotideCompleteness". Returns NULL when any step is absent.
  pluck_field <- function(node, path) {
    for (key in path) {
      if (!is.list(node)) {
        return(NULL)
      }
      node <- node[[key]]
    }
    node
  }

  # Reduce a field to exactly one character cell so rows always line up with
  # the columns: NA when missing/empty, "; "-joined when multi-valued.
  as_cell <- function(value) {
    value <- unlist(value, use.names = FALSE)
    if (length(value) == 0L) {
      return(NA_character_)
    }
    if (length(value) == 1L) {
      return(as.character(value))
    }
    paste(value, collapse = "; ")
  }

  # Get selected records
  selected_records <- report[records]
  n_selected <- length(selected_records)

  message("There are ", length(report), " records in given report.")
  message(n_selected, " records have been selected.")
  message("\nConverting report from list to data frame...")

  # One character vector per record, collected in a preallocated list and
  # bound once at the end (avoids growing a data frame inside the loop).
  rows <- vector("list", n_selected)
  for (i in seq_len(n_selected)) {
    # Progress report for very large data sets
    prcnt_prgrs <- (i / n_selected) * 100
    message(sprintf("Progress: %.2f%% \r", prcnt_prgrs), appendLF = FALSE)

    record <- selected_records[[i]]
    rows[[i]] <- vapply(
      field_paths,
      function(path) as_cell(pluck_field(record, path)),
      character(1),
      USE.NAMES = FALSE
    )
  }

  # Combine into Data Frame; an empty selection still yields all columns.
  if (n_selected == 0L) {
    cells <- matrix(character(0), nrow = 0L, ncol = length(field_paths))
  } else {
    cells <- do.call(rbind, rows)
  }
  report_df <- as.data.frame(cells, stringsAsFactors = FALSE)
  colnames(report_df) <- names(field_paths)

  # Convert specified columns from char to usable data type
  for (column in c("completeness", "isolate_source",
                   "nucleotideCompleteness")) {
    report_df[[column]] <- as.factor(report_df[[column]])
  }
  for (column in c("geneCount", "length", "maturePeptideCount")) {
    report_df[[column]] <- as.numeric(report_df[[column]])
  }
  for (column in c("isolate_collectionDate", "releaseDate", "updateDate")) {
    # Only the leading YYYY-MM-DD part is parsed; values that do not start
    # with a full date become NA.
    report_df[[column]] <- as.Date(report_df[[column]], format = "%Y-%m-%d")
  }

  # Finish message
  message("\nFinished converting report from list to data frame")
  report_df
}
#END of report_as_dataframe()
# Helper: substitute NA for a missing (NULL) value so that the value still
# occupies a slot when it is placed into a vector or data frame. Any non-NULL
# input is handed back untouched.
pass_value <- function(x) {
  if (is.null(x)) NA else x
}
#END of pass_value
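# NOTE: the two lines below are web-page residue, not part of the R source;
# they are kept as comments so the file still parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.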