View source: R/clean_windows.R
clean_windows | R Documentation |
clean_windows
Method to use genomic sequences instead of peptides to
estimate synteny across genomes. This is completely distinct from the main
GENESPACE routines, relying on minimap2 alignments of windowed sequences.
Windowing increases speed relative to standard WGA–>SYRI methods, but comes
at a cost of precision. Like the main GENESPACE functions, this is to provide
a coarse view of synteny and subsequent local alignments should be used to
test for basepair-level differences.
clean_windows
clean_windows
minimap_4synteny
minimap_4synteny
check_minimap2install
check_minimap2install
subset_minChrSize
subset_minChrSize
flag_pafSynteny
flag_pafSynteny
calc_pafBlkCoords
calc_pafBlkCoords
read_paf
read_paf
window_fasta
window_fasta
clean_windows(
faFiles,
wd,
onlySameChrs = FALSE,
genomeIDs = NULL,
ploidy = 1,
stripFaNames = function(x) gsub("\\.fa(?:\\.gz)?$", "", basename(x)),
windowSize = 1000,
syntenicBlkSize = ceiling((2e+05/windowSize)/5),
syntenicHitRadius = 5,
syntenicBlkBpRadius = 250000,
minChrSize = windowSize * syntenicBlkSize,
nCores = 1,
asmPreset = "asm5",
minimap2call = "minimap2",
keepSecondary = FALSE,
mm2mode = "fast",
quantileThresh = 0.25,
overwrite = FALSE,
verbose = TRUE
)
minimap_4synteny(
faFile1,
faFile2,
genomeID1,
genomeID2,
windowSize = 1000,
minChrSize = 1e+06,
mm2Dir,
outDir,
asmPreset = "asm5",
minimap2call = "minimap2",
keepSecondary = FALSE,
nCores = 12,
mm2mode = "fast",
overwrite = FALSE,
verbose = TRUE,
quantileThresh = 0.25,
syntenicBlkSize = ceiling((1e+05/windowSize)/5),
syntenicHitRadius = 5,
syntenicBlkBpRadius = 1e+05
)
check_minimap2install(minimap2call)
subset_minChrSize(inFile, outFile, minChrSize, overwrite, verbose)
flag_pafSynteny(
paf,
syntenicBlkSize,
syntenicHitRadius,
syntenicBlkBpRadius,
windowSize
)
calc_pafBlkCoords(paf)
read_paf(x)
window_fasta(inFile, outFile, windowSize, stepSize, verbose, overwrite)
faFiles |
character vector coercible to a file path pointing to the genome assembly fasta files to use |
wd |
file.path for internal functions, not meant for direct use. |
onlySameChrs |
logical, for internal functions, not meant for direct use. |
genomeIDs |
character vector with the names for the genomes stored in fasta files |
ploidy |
integer vector corresponding to the ploidies of the genomes stored in the fasta files. Currently polyploids are not allowed. Setting to > 1 will result in an error. |
stripFaNames |
function applied to basename(faFiles) to get a shorter ID for each genome. Ignored if genomeIDs != NULL. |
windowSize |
integer, specifying the width of windows. Smaller windows are marginally faster and offer more resolution in unique sequences. Larger windows can be slower and are more precise in repetitive genomes, but also may miss more SVs. |
syntenicBlkSize |
integer, specifying the number of windows needed to form a syntenic block. By default, the smallest block that can be detected is 40kb, so 40 1kb blocks. If not specified, this will scale so that blocks are ~ 40kb. |
syntenicHitRadius |
integer, specifying the number of hits away from an initial anchor a syntenic anchor hit can exist. |
syntenicBlkBpRadius |
integer, the maximum basepair distance between two adjacent anchor hits for which those hits can be in the same syntenic block. |
minChrSize |
integer, the minimum size in basepairs for a sequence to be considered. |
nCores |
integer, number of parallel processes to run. |
asmPreset |
character string matching one of the minimap -x options |
minimap2call |
character string coercible to a file path point to the minimap2 program call. If in the path, can leave black or set as "minimap2" |
keepSecondary |
logical, specifying whether secondary hits should be retained. |
mm2mode |
character string either "fast" or "default". If fast, parameters minimap2 -k 25 -w 20 are added. |
quantileThresh |
numeric [0-1] specifying the stringency of culling for initial syntenic anchor hits. Lower numbers are more inclusive. |
overwrite |
logical, if results exist, should they be overwritten. |
verbose |
logical, should updates be printed to the console? |
faFile1 |
character, for internal functions, not meant for direct use. |
faFile2 |
character, for internal functions, not meant for direct use. |
genomeID1 |
character, for internal functions, not meant for direct use. |
genomeID2 |
character, for internal functions, not meant for direct use. |
mm2Dir |
file.path for internal functions, not meant for direct use. |
outDir |
file.path for internal functions, not meant for direct use. |
inFile |
character, for internal functions, not meant for direct use. |
outFile |
character, for internal functions, not meant for direct use. |
paf |
data.table, for internal functions, not meant for direct use. |
x |
character, for internal functions, not meant for direct use. |
stepSize |
numeric, for internal functions, not meant for direct use.
|
## Not run:
st <- "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/"
hg38url <- file.path(
st,
"000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz")
t2turl <- file.path(
st,
"009/914/755/GCA_009914755.3_T2T-CHM13v1.1/GCA_009914755.3_T2T-CHM13v1.1_genomic.fna.gz")
wd <- file.path("~/Downloads/test_genespace")
hg38file <- file.path(wd, "hg38.fa.gz")
t2tfile <- file.path(wd, "t2t.fa.gz")
hg38file1 <- file.path(wd, "hg38Chr1.fa.gz")
t2tfile1 <- file.path(wd, "t2tChr1.fa.gz")
if(!dir.exists(wd))
dir.create(wd)
options(timeout=1000)
download.file(url = hg38url, destfile = hg38file)
download.file(url = t2turl, destfile = t2tfile)
tmp <- Biostrings::readDNAStringSet(hg38file)[1]
names(tmp) <- "chr1"
Biostrings::writeXStringSet(tmp, filepath = hg38file1, compress = T)
tmp <- Biostrings::readDNAStringSet(t2tfile)[1]
names(tmp) <- "chr1"
Biostrings::writeXStringSet(tmp, filepath = t2tfile1, compress = T)
test <- clean_windows(
wd = wd,
nCores = 12,
faFiles = c(hg38file1, t2tfile1))
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.