alliance | R Documentation |
Text mining RefSeq description, PubMed, BugSigDB and the other manually curated textual data.
alliance(
geneList,
alliancePath = "GENE-DESCRIPTION-TSV_HUMAN.tsv",
keyType = "SYMBOL",
excludeFreq = 2000,
exclude = "frequency",
filterMax = FALSE,
excludeType = ">",
tfidf = FALSE,
genePlotNum = 10,
preserve = FALSE,
takeMax = FALSE,
additionalRemove = NA,
onlyCorpus = FALSE,
madeUpper = c("dna", "rna"),
organism = 9606,
pal = c("blue", "red"),
numWords = 30,
scaleRange = c(5, 10),
autoScale = FALSE,
showLegend = FALSE,
orgDb = org.Hs.eg.db,
edgeLabel = FALSE,
naEdgeColor = "grey50",
cooccurrence = FALSE,
pvclAlpha = 0.95,
cl = FALSE,
ngram = 1,
plotType = "network",
onlyTDM = FALSE,
stem = FALSE,
colorText = FALSE,
corThresh = 0.2,
genePlot = FALSE,
autoThresh = TRUE,
genePathPlot = NULL,
genePathPlotSig = 0.05,
tag = "none",
layout = "nicely",
edgeLink = TRUE,
deleteZeroDeg = TRUE,
enrich = NULL,
topPath = 10,
tagWhole = FALSE,
mergeCorpus = NULL,
numOnly = TRUE,
madeUpperGenes = TRUE,
onWholeDTM = FALSE,
pre = TRUE,
takeMean = FALSE,
tagPalette = NULL,
collapse = FALSE,
addFreqToGene = FALSE,
useUdpipe = FALSE,
normalize = FALSE,
fontFamily = "sans",
udpipeModel = "english-ewt-ud-2.5-191206.udpipe",
scaleFreq = NULL,
colorize = FALSE,
geneColor = "grey",
argList = list(),
useggwordcloud = TRUE,
wcScale = 10,
catColors = NULL,
discreteColorWord = FALSE,
useSeed = 42,
scaleEdgeWidth = c(1, 3),
filterByGO = FALSE,
docsum = FALSE,
absolute = TRUE,
corOption = list()
)
bugsigdb(
mbList,
excludeFreq = 1000,
exclude = "frequency",
excludeType = ">",
normalize = FALSE,
takeMean = FALSE,
additionalRemove = NA,
tfidf = FALSE,
target = "title",
apiKey = NULL,
takeMax = FALSE,
pre = FALSE,
pvclAlpha = 0.95,
numOnly = TRUE,
madeUpper = c("dna", "rna"),
redo = NULL,
fontFamily = "sans",
pal = c("blue", "red"),
numWords = 15,
preserve = FALSE,
metab = NULL,
metThresh = 0.2,
curate = TRUE,
abstArg = list(),
tagPalette = NULL,
metCol = NULL,
scaleRange = c(5, 10),
showLegend = FALSE,
ecPlot = FALSE,
edgeLabel = FALSE,
mbPlot = FALSE,
onlyTDM = FALSE,
ecFile = NULL,
upTaxFile = NULL,
filterMax = FALSE,
mbColor = "grey",
useUdpipe = FALSE,
colorize = FALSE,
cooccurrence = FALSE,
udpipeModel = "english-ewt-ud-2.5-191206.udpipe",
scaleFreq = NULL,
ngram = 1,
plotType = "network",
disPlot = FALSE,
onWholeDTM = FALSE,
naEdgeColor = "grey50",
useggwordcloud = TRUE,
wcScale = 10,
addFreqToMB = FALSE,
catColors = NULL,
useSeed = 42,
discreteColorWord = FALSE,
colorText = FALSE,
corThresh = 0.2,
tag = "none",
tagWhole = FALSE,
stem = FALSE,
layout = "nicely",
edgeLink = TRUE,
deleteZeroDeg = TRUE,
cl = FALSE,
autoThresh = TRUE,
argList = list(),
docsum = FALSE,
absolute = TRUE,
corOption = list(),
cache = TRUE
)
manual(
df,
madeUpper = NULL,
useFil = NA,
filType = "above",
cooccurrence = FALSE,
filNum = 0,
useQuanteda = FALSE,
quantedaArgs = list(),
pvclAlpha = 0.95,
numOnly = TRUE,
tfidf = FALSE,
cl = FALSE,
pal = c("blue", "red"),
numWords = 30,
scaleRange = c(5, 10),
scaleFreq = NULL,
showLegend = FALSE,
plotType = "network",
colorText = FALSE,
corThresh = 0.2,
layout = "nicely",
tag = "none",
tagWhole = FALSE,
onlyCorpus = FALSE,
onlyTDM = FALSE,
queryColor = "grey",
edgeLabel = FALSE,
edgeLink = TRUE,
ngram = 1,
colorize = FALSE,
tagPalette = NULL,
preserve = FALSE,
takeMax = FALSE,
catColors = NULL,
deleteZeroDeg = TRUE,
additionalRemove = NA,
naEdgeColor = "grey50",
normalize = FALSE,
takeMean = FALSE,
queryPlot = FALSE,
collapse = FALSE,
onWholeDTM = FALSE,
stem = FALSE,
argList = list(),
useUdpipe = FALSE,
discreteColorWord = FALSE,
autoThresh = TRUE,
useggwordcloud = TRUE,
wcScale = 10,
fontFamily = "sans",
addFreqToNonWords = FALSE,
filterByGO = FALSE,
docsum = FALSE,
absolute = TRUE,
corOption = list(),
udpipeModel = "english-ewt-ud-2.5-191206.udpipe",
useSeed = 42
)
pubmed(
queries,
useRawQuery = FALSE,
redo = NULL,
madeUpper = c("dna", "rna"),
target = "abstract",
useFil = NA,
filType = "above",
filNum = 0,
sortOrder = "relevance",
fontFamily = "sans",
pvclAlpha = 0.95,
numOnly = TRUE,
delim = "OR",
apiKey = NULL,
tfidf = FALSE,
cl = FALSE,
autoThresh = TRUE,
pal = c("blue", "red"),
numWords = 30,
scaleRange = c(5, 10),
showLegend = FALSE,
plotType = "network",
colorText = FALSE,
quote = FALSE,
corThresh = 0.2,
layout = "nicely",
tag = "none",
tagWhole = FALSE,
onlyCorpus = FALSE,
onlyTDM = FALSE,
retMax = 10,
edgeLabel = FALSE,
edgeLink = TRUE,
ngram = 1,
genePlot = FALSE,
scaleFreq = NULL,
onlyDf = FALSE,
tagPalette = NULL,
preserve = FALSE,
takeMax = FALSE,
catColors = NULL,
perQuery = FALSE,
discreteColorWord = FALSE,
useUdpipe = FALSE,
udpipeOnlyFreq = FALSE,
udpipeOnlyFreqNB = FALSE,
addFreqToQuery = FALSE,
naEdgeColor = "grey50",
cooccurrence = FALSE,
colorize = FALSE,
queryColor = "grey",
useggwordcloud = TRUE,
wcScale = 10,
distinguish_query = TRUE,
useSeed = 42,
udpipeModel = "english-ewt-ud-2.5-191206.udpipe",
normalize = FALSE,
takeMean = FALSE,
absolute = TRUE,
corOption = list(),
deleteZeroDeg = TRUE,
additionalRemove = NA,
orgDb = org.Hs.eg.db,
onlyGene = FALSE,
filterByGO = FALSE,
docsum = FALSE,
pre = FALSE,
onWholeDTM = FALSE,
madeUpperGenes = TRUE,
stem = FALSE,
argList = list(),
dateRange = NULL,
cc0 = FALSE
)
refseq(
geneList,
keyType = "SYMBOL",
excludeFreq = 2000,
exclude = "frequency",
filterMax = FALSE,
excludeType = ">",
tfidf = FALSE,
genePlotNum = 10,
preserve = FALSE,
takeMax = FALSE,
additionalRemove = NA,
onlyCorpus = FALSE,
madeUpper = c("dna", "rna"),
organism = 9606,
pal = c("blue", "red"),
numWords = 30,
scaleRange = c(5, 10),
autoScale = FALSE,
showLegend = FALSE,
orgDb = org.Hs.eg.db,
edgeLabel = FALSE,
naEdgeColor = "grey50",
cooccurrence = FALSE,
pvclAlpha = 0.95,
cl = FALSE,
multiVals = "first",
ngram = 1,
plotType = "network",
onlyTDM = FALSE,
stem = FALSE,
colorText = FALSE,
corThresh = 0.2,
genePlot = FALSE,
autoThresh = TRUE,
autoNumWords = FALSE,
genePathPlot = NULL,
genePathPlotSig = 0.05,
tag = "none",
layout = "nicely",
edgeLink = TRUE,
deleteZeroDeg = TRUE,
enrich = NULL,
topPath = 10,
ora = FALSE,
tagWhole = FALSE,
mergeCorpus = NULL,
numOnly = TRUE,
madeUpperGenes = TRUE,
onWholeDTM = FALSE,
pre = TRUE,
takeMean = FALSE,
tagPalette = NULL,
collapse = FALSE,
addFreqToGene = FALSE,
useUdpipe = FALSE,
normalize = FALSE,
fontFamily = "sans",
udpipeModel = "english-ewt-ud-2.5-191206.udpipe",
scaleFreq = NULL,
colorize = FALSE,
geneColor = "grey",
argList = list(),
useggwordcloud = TRUE,
wcScale = 10,
catColors = NULL,
discreteColorWord = FALSE,
useSeed = 42,
scaleEdgeWidth = c(1, 3),
splitByEA = NULL,
filterByGO = FALSE,
docsum = FALSE,
absolute = TRUE,
corOption = list()
)
geneList |
gene ID list |
alliancePath |
path to The Alliance of Genome Resources gene description file. default to "GENE-DESCRIPTION-TSV_HUMAN.tsv" |
keyType |
default to SYMBOL |
excludeFreq |
default to 2000 (1000 in 'bugsigdb') |
exclude |
"frequency" or "tfidf" |
filterMax |
Use pre-calculated filter based on max-values when excluding TfIdf. Otherwise take sum. |
excludeType |
">" or "<", combined with 'exclude' and 'excludeFreq', e.g. filter the words with the pre-calculated frequency > 5000 |
tfidf |
use TfIdf when making TDM, default to FALSE. |
genePlotNum |
number of genes to be plotted (default: 10) |
preserve |
Try to preserve original characters. |
takeMax |
Take max values for each term in term-document matrix |
additionalRemove |
specific words to be excluded |
onlyCorpus |
return only corpus (tm). |
madeUpper |
make these words uppercase in resulting plot, default to c("dna", "rna") |
organism |
organism ID to use in 'GeneSummary' |
pal |
palette for color gradient in correlation network. should be a vector of length two like c("red","blue"). |
numWords |
the number of words to be shown in the plot. When 'autoThresh' is TRUE, the threshold is chosen automatically so that this number of words is shown. |
scaleRange |
scale for label and node size in the network. |
autoScale |
scale the label and node size automatically for the large network. |
showLegend |
whether to show legend in the network |
orgDb |
the database used to convert identifiers, default to org.Hs.eg.db. |
edgeLabel |
if TRUE, plot the edge label (default: FALSE) |
naEdgeColor |
edge colors for NA values (linking query with the category other than text) |
cooccurrence |
default to FALSE, if TRUE, use cooccurrence instead of correlation. |
pvclAlpha |
alpha value for pvpick() |
cl |
for parPvclust, parallel clustering can be performed |
ngram |
N-gram specification, default to 1. |
plotType |
"wc" or "network", default to "network" |
onlyTDM |
return only TDM (tm). |
stem |
whether to use stemming when making corpus. |
colorText |
color text label based on frequency in the network |
corThresh |
the correlation (cooccurrence) threshold. |
genePlot |
plot associated genes (default: FALSE). This option first calculates the high-frequency words, subsequently calculates the occurrences of these words in each gene, and prioritizes the genes by them. |
autoThresh |
automatically choose thresholding value to show the 'numWords', when deleteZeroDeg (deleting no-connected words) is TRUE, which is default. |
genePathPlot |
plot associated genes and pathways (default: NULL) Must be "kegg" or "reactome", automatically set genePlot to TRUE. |
genePathPlotSig |
threshold for adjusted p-values (default: 0.05) |
tag |
perform pvclust on words and colorize them in the wordcloud or network. The argument must be one of those accepted in the pvclust 'method.dist' option, like "correlation". Default to "none", which performs no tagging. |
layout |
the layout for the network, default to "nicely". It can be one of the layouts implemented in 'igraph' and 'ggraph', such as 'kk' (Kamada-Kawai), 'nicely' (automatic selection of algorithm), 'drl' (the force-directed DrL layout). The options are available at: https://igraph.org/r/doc/layout_.html |
edgeLink |
if FALSE, use geom_edge_diagonal. if TRUE, geom_edge_link. Default to TRUE. |
deleteZeroDeg |
delete zero degree node from plot in the network |
enrich |
currently, only 'reactome' and 'kegg' are supported. |
topPath |
how many pathway descriptions are included in text analysis, sorted by p-values in the results. |
tagWhole |
whether to perform pvclust on whole matrix or subset of the matrix. |
mergeCorpus |
specify multiple corpora if you intend to combine them, like PubMed information and RefSeq summary |
numOnly |
delete number only (not deleting XXX123, but delete only the number) |
madeUpperGenes |
make genes upper case automatically (default to TRUE) This uses the 'SYMBOL' key in 'orgDb'. |
onWholeDTM |
calculate correlation network on whole dataset or top-words specified by numWords. |
pre |
remove preset filtering words. |
takeMean |
take mean values for each term in term-document matrix. |
tagPalette |
tag palette used when 'tag' is specified (other than "none"). It is also used for dependency network using udpipe, and tagging colorization for word cloud. Default to NULL, which indicates that the palette is set automatically. |
collapse |
default to FALSE, collapse all the sentences. |
addFreqToGene |
add pseudo frequency to gene in genePlot, default to FALSE. |
useUdpipe |
use udpipe to make a dependency network. |
normalize |
sum normalize the term frequency document-wise. |
fontFamily |
font family to use, default to "sans". |
udpipeModel |
udpipe model file name. |
scaleFreq |
default to NULL, scale the value if specified |
colorize |
color the word nodes by their frequency, and the other nodes by their category. if colorize=FALSE and addFreqToGene=TRUE, gene nodes are colorized according to the minimum frequency of the words in the network |
geneColor |
color for associated genes with words (used when tag or colorize option is TRUE) |
argList |
parameters to pass to wordcloud() or ggwordcloud() |
useggwordcloud |
default to TRUE, otherwise use 'wordcloud' function. |
wcScale |
scaling size for ggwordcloud |
catColors |
colors for words and texts when colorize is TRUE and discreteColorWord is TRUE |
discreteColorWord |
colorize words by "Words" category, not frequency. |
useSeed |
random seed |
scaleEdgeWidth |
scale for edge width |
filterByGO |
filter the results to the words obtained from GO terms, while preserving the number of words to be shown |
docsum |
if TRUE, convert the term-document matrix to binary. |
absolute |
calculate absolute correlation value |
corOption |
passed to 'cor' function, like list("method"="kendall") |
mbList |
microbe ID list |
target |
"title" or "abstract" |
apiKey |
api key for eutilities |
redo |
if plot in other parameters, input the previous list |
metab |
tibble of metabolite - taxon association |
metThresh |
threshold of association |
curate |
include articles in bugsigdb |
abstArg |
passed to PubMed function when using curate=FALSE |
metCol |
metabolite data frame column name in the order of "candidate taxon", "metabolite", "quantitative values for thresholding" |
ecPlot |
plot link between enzyme and microbes this option requires two files to be passed to enzyme() and getUPTax(). |
mbPlot |
plot microbe names |
ecFile |
enzyme database file |
upTaxFile |
UniProt taxonomy file |
mbColor |
color for Microbes when tagPalette or catColors is not specified |
disPlot |
plot diseases |
addFreqToMB |
add pseudo frequency to microbes in mbPlot |
cache |
Caching when BugSigDB is downloaded (default to TRUE) |
df |
manual document data.frame (must have column 'text') or vector of text. If 'query' column and other columns are present, regards them as category related to the text on the same row. |
useFil |
filter based on "GS_TfIdf" (whole gene summary tf-idf) or "BSDB_TfIdf" (whole bugsigdb tf-idf) |
filType |
"above" or "below" |
filNum |
specify filter tfidf |
useQuanteda |
use quanteda functions to generate |
quantedaArgs |
list of arguments to be passed to tokens() |
queryColor |
color for associated queries with words |
queryPlot |
plot the query in the graph in relation with the words |
addFreqToNonWords |
add pseudo-frequency corresponding to minimum frequency of the words to nodes other than words |
queries |
query ID list |
useRawQuery |
if you would like to send the query as is, please set this option to TRUE. |
sortOrder |
sort order, passed to rentrez function |
delim |
delimiter for queries |
quote |
whether to quote the queries |
retMax |
how many items are to be retrieved? |
onlyDf |
return only the raw data.frame of searching PubMed |
perQuery |
search for the queries one by one recursively, not using 'delim'. |
udpipeOnlyFreq |
when using udpipe, include only high-frequent words |
udpipeOnlyFreqNB |
when using udpipe, include only the neighbors of high-frequent words |
addFreqToQuery |
add pseudo-frequency to query node |
distinguish_query |
if TRUE, distinguish query and returned texts by appending (Q) on query |
onlyGene |
plot only the gene symbol (orgDb with SYMBOL key can be used) |
dateRange |
if specified, restrict the range of publication date. Must be the two-length vector, like 'c("2013/1/1", "2023/1/1")' |
cc0 |
effective only in 'pubmed' function. The prefetched data in 'pubmedMini' package is used for gene query. |
multiVals |
passed to mapIds function when converting ENTREZID to SYMBOL when genePlot is on. |
autoNumWords |
determine the number of words to be shown by ORA, default to FALSE. |
ora |
perform over-representation analysis or not (experimental) |
splitByEA |
automatically split the genes based on significant enrichment analysis results, and return a list of objects, one for each term. Default to NULL. Must be 'kegg' or 'reactome', in which case the function performs over-representation analysis by enrichKEGG or enrichPathway in clusterProfiler and ReactomePA. |
The main functions of the biotextgraph package. The functions accept
a character vector of biological entities (such as gene identifiers)
and return the summarized statistics and visualization
contained in a biotext
object.
'biotext' class object
geneList <- c("DDX41","PNKP","ERCC1","IRF3","XRCC1")
## Not run: alliance(geneList)
mbList <- c("Veillonella dispar")
## Not run:
bugsigdb(mbList, plotType="wc")
## End(Not run)
ret <- refseq("DDX41", plotType="wc")
manual(getSlot(ret, "rawText")$Gene_summary, plotType="wc")
## Not run: pubmed("DDX41")
geneList <- c("DDX41","PNKP","ERCC1","IRF3","XRCC1")
refseq(geneList)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.