R/LDhap.R
In LDlinkR: Calculating Linkage Disequilibrium (LD) in Human Population Groups of Interest

Documented in LDhap

# LDlinkR::LDhap

########### Secondary Function:  Called by primary function LDhap ##########
df_merge <- function(data_out, table_type, genome_build) {

  ### Count the number of snps listed in data_out ###
  for (i in 1:dim(data_out)[1]) {
    if (substr(data_out[i,], 1, 1)[1]=="#") {
      break
    }
  }
  num_of_snps <- i-1

  ### split data_out into two, then create new data.frames from pieces of these two ###
  # 1
  data_out_var <- data_out[1:num_of_snps,]                                         # create first of new data frames

  # 2
  data_out2 <- data_out[(num_of_snps+2):nrow(data_out),]                           # create second of new data frames
  colnames(data_out2) <- as.character(unlist(data_out2[1,]))                       # assign first row as new column names
  data_out2 <- data_out2[-1,]                                                      # remove first row, no longer needed

  # 3
  if (num_of_snps == 1) {                                                          # when num_of_snps = 1, no need to split column by delimiter
    data_out_hap <- data_out2                                                      # for consistency only
    names(data_out_hap)[1] <- as.character(unlist(data_out_var$RS_Number))         # change column name to match RS_number from data_out_var
    rownames(data_out_hap) <- NULL                                                 # Remove row names
    # return(data_out_hap)

  } else if (num_of_snps > 1) {
    tmp <- strsplit(as.character(data_out2$Haplotype),'_')                              # split first column by delimiter '_', creates a list
    data_out_hap <- with(data_out2, data.frame(t(sapply(tmp, `[`))))                    # create new data.frame, sapply()'s results need to be transposed
    data_out_hap <- cbind(data_out_hap, data_out2[,2:3])                                # combine columns #2 & 3 from data_out w/ data_out_hap
    names(data_out_hap)[1:num_of_snps] <- as.character(unlist(data_out_var$RS_Number))  # change column names to match RS_numbers from data_out_var
    rownames(data_out_hap) <- NULL                                                      # Remove row names
    # return(data_out_hap)
  }

  # Before eval 'table_type' arg, use regex [^<>]+ to change column names of data_out_var
  colnames(data_out_var) <- gsub("Position[^<>]+", paste("Position", genome_build, sep="_"), colnames(data_out_var))
  colnames(data_out_var) <- gsub("Allele[^<>]+", "Allele_Frequency", colnames(data_out_var))

  # Evaluate 'table_type' parameter
  if (table_type == "variant") {
    return(data_out_var)
  } else if (table_type == "haplotype") {
    return(data_out_hap)
  } else if (table_type == "both") {
    # combine data_out_var and data_out_hap into a list
    data_out_both <- list(data_out_var, data_out_hap)
    return(data_out_both)
  } else if (table_type == "merged") {
    # transpose data.frame
    data_out_hap_t <- as.data.frame(t(as.matrix(data_out_hap)))

    # combine haplotypes from data_out_hap_t
    df_all <- cbind(data_out_var, data_out_hap_t[c(1:num_of_snps),c(1:ncol(data_out_hap_t))])

    # remove extra rownames
    # rownames(df_all) <- c()

    # change column names
    names(df_all)[4] <- "Haplotypes"

    # create new data.frame that includes Count & Frequency
    df1 <- data_out_hap_t[(num_of_snps+1):nrow(data_out_hap_t),]
    # change row names to "Haplotype_Count" & "Haplotype Frequency"
    rownames(df1)[rownames(df1) %in%
                    c("Count", "Frequency")] <-
      c("Haplotype_Count", "Haplotype_Frequency")
    # Convert row names into first column
    df1 <- cbind(rownames(df1), data.frame(df1, row.names=NULL))

    # create empty data.frame
    df2 <- data.frame(matrix("   ", nrow = 2, ncol = 2))
    # combine df1 & df2
    df3 <- cbind(df2, df1)

    # row bind new data frame, df3, to df_all
    data_out_merged <- data.frame(mapply(c, df_all, df3))

    # change column names
    names(data_out_merged)[5:ncol(data_out_merged)] <- "  "

    # return data
    return(data_out_merged)
    # End else if
  }

  # End bracket
}
############################## End Sec. Function ##############################


########## Primary Function:  LDhap ##########
#' Calculates population specific haplotype frequencies of all
#' haplotypes observed for a list of query variants.
#'
#' @param snps list of between 1 - 30 variants, using an rsID or chromosome coordinate (e.g. "chr7:24966446")
#' @param pop a 1000 Genomes Project population, (e.g. YRI or CEU), multiple allowed, default = "CEU"
#' @param token LDlink provided user token, default = NULL, register for token at  \url{https://ldlink.nih.gov/?tab=apiaccess}
#' @param file Optional character string naming a path and file for saving results.  If file = FALSE, no file will be generated, default = FALSE.
#' @param table_type Choose from one of four options available to determine output
#' format type...`haplotype`, `variant`, `both` and `merged`. Default = "haplotype".
#' @param genome_build Choose between one of the three options...`grch37` for genome build GRCh37 (hg19),
#' `grch38` for GRCh38 (hg38), or `grch38_high_coverage` for GRCh38 High Coverage (hg38) 1000 Genome Project
#' data sets.  Default is GRCh37 (hg19).
#' @param api_root Optional alternative root url for API.
#'
#' @return a data frame or list
#' @importFrom httr GET content stop_for_status http_error
#' @importFrom utils capture.output read.delim write.table
#' @export
#'
#' @examples
#' \dontrun{LDhap(c("rs3", "rs4", "rs148890987"), "CEU", token = Sys.getenv("LDLINK_TOKEN"))}
#' \dontrun{LDhap("rs148890987", c("YRI", "CEU"), token = Sys.getenv("LDLINK_TOKEN"))}
#'
LDhap <- function(snps,
                  pop="CEU",
                  token=NULL,
                  file = FALSE,
                  table_type="haplotype",
                  genome_build = "grch37",
                  api_root="https://ldlink.nih.gov/LDlinkRest") {

   LD_config <- list(ldhap.url=paste0(api_root,"/ldhap"),
                    avail_pop=c("YRI","LWK","GWD","MSL","ESN","ASW","ACB",
                                "MXL","PUR","CLM","PEL","CHB","JPT","CHS",
                                "CDX","KHV","CEU","TSI","FIN","GBR","IBS",
                                "GIH","PJL","BEB","STU","ITU",
                                "ALL", "AFR", "AMR", "EAS", "EUR", "SAS"),
                    avail_table_type=c("haplotype", "variant", "both", "merged"),
                    avail_genome_build = c("grch37", "grch38", "grch38_high_coverage")
                   )

  url <- LD_config[["ldhap.url"]]
  avail_pop <- LD_config[["avail_pop"]]
  avail_table_type <- LD_config[["avail_table_type"]]
  avail_genome_build <- LD_config[["avail_genome_build"]]

  # ensure file option is a character string
  file <- as.character(file)

  # Define regular expressions used to check arguments for valid input below
  rsid_pattern <- "^rs\\d{1,}"
  # Syntax               Description
  # ^rs                  rsid starts with 'rs'
  # \\d{1,}              followed by 1 or more digits

  chr_coord_pattern <- "(^chr)(\\d{1,2}|X|x|Y|y):(\\d{1,9})$"
  # Syntax               Description
  # (^chr)               chromosome coordinate starts with 'chr'
  # (\\d{1,2}|X|x|Y|y)   followed by one or two digits, 'X', 'x', 'Y', 'y', to designate chromosome
  # :                    followed by a colon
  # (\\d{1,9})$          followed by 1 to 9 digits only to the end of string


  # Checking arguments for valid input
  if(!(length(snps) >= 1 & length(snps) <= 30)) {
    stop("Input is between 1 to 30 variants only.")
  }

  for(i in 1:length(snps)) {
    if(!((grepl(rsid_pattern, snps[i], ignore.case = TRUE)) | (grepl(chr_coord_pattern, snps[i], ignore.case = TRUE))))  {
      stop(paste("Invalid query format for variant: ",snps[i], ".", sep=""))
    }
  }

  if(!(all(pop %in% avail_pop))) {
    stop("Not a valid population code.")
  }

  if(length(pop) > 1) {
    pop=paste(unlist(pop), collapse = "%2B")
  }

  if(!(all(table_type %in% avail_table_type))) {
    stop("Not a valid option for table_type.")
  }

  # Ensure input for 'genome_build' is valid.
  if(length(genome_build) > 1) {
    stop("Invalid input.  Please choose only one available genome build.")
  }

  if(!(all(genome_build %in% avail_genome_build))) {
    stop("Not an available genome build.")
  }

  if(is.null(token)) {
    stop("Enter valid access token. Please register using the LDlink API Access tab: https://ldlink.nih.gov/?tab=apiaccess")
  }


  # Request body
  snps_to_upload <- paste(unlist(snps), collapse = "%0A")
  body <- list(paste("snps=", snps_to_upload, sep = ""),
               paste("pop=", pop, sep = ""),
               paste("genome_build=", genome_build, sep = ""),
               paste("token=", token, sep=""))

  # URL query string
  url_str <- paste(url, "?", paste(unlist(body), collapse = "&"), sep="")

  # before 'GET command', check if LDlink server is up and accessible...
  # if server is down pkg should fail gracefully with informative message (not error)
  if (httr::http_error(url)) { # if server is down use message (and not an error)
    message("The LDlink server is down or not accessible. Please try again later.")
    return(NULL)
  } else { # network is up then proceed
    message("\nLDlink server is working...\n")
  }

  # GET command, request to the web server
  raw_out <- httr::GET(url=url_str)
  httr::stop_for_status(raw_out)
  # Parse response object, raw_out
  data_out <- read.delim(textConnection(httr::content(raw_out, "text", encoding = "UTF-8")), header=T, as.is = T, sep="\t")

  # Check for error/warning in response data
  if(sum(grepl("error", data_out, ignore.case = TRUE), na.rm = TRUE)) {
    # subset rows in data_out that contain text 'error'
    error_msg <- subset(data_out, grepl("error", data_out[,1], ignore.case = TRUE))

    # delete any column names so that they don't go to output
    names(error_msg) <- NULL

    error_msg <- paste(error_msg, collapse = " ")

    stop(error_msg)
  }

  if(sum(grepl("warning", data_out, ignore.case = TRUE), na.rm = TRUE)) {
    # subset rows in data_out that contain text 'error'
    warning_msg <- subset(data_out, grepl("warning", data_out[,1], ignore.case = TRUE))

    # delete any column names so that they don't go to output
    names(warning_msg) <- NULL

    message(warning_msg[grep("warning", data_out, value = FALSE, ignore.case = FALSE)])
  }

  # Call function to create a new data.frame by merging data returned from the LDlink web site
  data_out_hap <- df_merge(data_out, table_type, genome_build)

  # Evaluate 'file' option
  # File output depends on 'table_type' option selected, opt. 'both' output is returned
  # as a vector/list while the other three options return output as a data.frame. These
  # require different ways to write to file.
  if (file == FALSE) {
    return(data_out_hap)
  } else if (is.character(file)) {
    if (is.data.frame(data_out_hap)) {
      write.table(data_out_hap, file = file, quote = F, row.names = F, sep = "\t")
      cat(paste("\nFile saved to ",file,".\n\n", sep=""))
      return(data_out_hap)
    } else if (is.vector(data_out_hap)) {
      sink(file = file)
      print(data_out_hap)
      sink()
      cat(paste("\nFile saved to ",file,".\n\n", sep=""))
      return(data_out_hap)
    }
  }

}
############################## End Function ##############################

Any scripts or data that you put into this service are public.

LDlinkR documentation built on May 29, 2024, 4:32 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

LDlinkR
Calculating Linkage Disequilibrium (LD) in Human Population Groups of Interest

R/LDhap.R
In LDlinkR: Calculating Linkage Disequilibrium (LD) in Human Population Groups of Interest

Defines functions LDhap df_merge

Documented in LDhap

Try the LDlinkR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

LDlinkR Calculating Linkage Disequilibrium (LD) in Human Population Groups of Interest

R/LDhap.R In LDlinkR: Calculating Linkage Disequilibrium (LD) in Human Population Groups of Interest

Defines functions LDhap df_merge

Documented in LDhap

Try the LDlinkR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

LDlinkR
Calculating Linkage Disequilibrium (LD) in Human Population Groups of Interest

R/LDhap.R
In LDlinkR: Calculating Linkage Disequilibrium (LD) in Human Population Groups of Interest