generalf: refseq, alliance, pubmed, manual, bugsigdb

allianceR Documentation

refseq, alliance, pubmed, manual, bugsigdb

Description

Text mining RefSeq description, PubMed, BugSigDB and the other manually curated textual data.

Usage

alliance(
  geneList,
  alliancePath = "GENE-DESCRIPTION-TSV_HUMAN.tsv",
  keyType = "SYMBOL",
  excludeFreq = 2000,
  exclude = "frequency",
  filterMax = FALSE,
  excludeType = ">",
  tfidf = FALSE,
  genePlotNum = 10,
  preserve = FALSE,
  takeMax = FALSE,
  additionalRemove = NA,
  onlyCorpus = FALSE,
  madeUpper = c("dna", "rna"),
  organism = 9606,
  pal = c("blue", "red"),
  numWords = 30,
  scaleRange = c(5, 10),
  autoScale = FALSE,
  showLegend = FALSE,
  orgDb = org.Hs.eg.db,
  edgeLabel = FALSE,
  naEdgeColor = "grey50",
  cooccurrence = FALSE,
  pvclAlpha = 0.95,
  cl = FALSE,
  ngram = 1,
  plotType = "network",
  onlyTDM = FALSE,
  stem = FALSE,
  colorText = FALSE,
  corThresh = 0.2,
  genePlot = FALSE,
  autoThresh = TRUE,
  genePathPlot = NULL,
  genePathPlotSig = 0.05,
  tag = "none",
  layout = "nicely",
  edgeLink = TRUE,
  deleteZeroDeg = TRUE,
  enrich = NULL,
  topPath = 10,
  tagWhole = FALSE,
  mergeCorpus = NULL,
  numOnly = TRUE,
  madeUpperGenes = TRUE,
  onWholeDTM = FALSE,
  pre = TRUE,
  takeMean = FALSE,
  tagPalette = NULL,
  collapse = FALSE,
  addFreqToGene = FALSE,
  useUdpipe = FALSE,
  normalize = FALSE,
  fontFamily = "sans",
  udpipeModel = "english-ewt-ud-2.5-191206.udpipe",
  scaleFreq = NULL,
  colorize = FALSE,
  geneColor = "grey",
  argList = list(),
  useggwordcloud = TRUE,
  wcScale = 10,
  catColors = NULL,
  discreteColorWord = FALSE,
  useSeed = 42,
  scaleEdgeWidth = c(1, 3),
  filterByGO = FALSE,
  docsum = FALSE,
  absolute = TRUE,
  corOption = list()
)

bugsigdb(
  mbList,
  excludeFreq = 1000,
  exclude = "frequency",
  excludeType = ">",
  normalize = FALSE,
  takeMean = FALSE,
  additionalRemove = NA,
  tfidf = FALSE,
  target = "title",
  apiKey = NULL,
  takeMax = FALSE,
  pre = FALSE,
  pvclAlpha = 0.95,
  numOnly = TRUE,
  madeUpper = c("dna", "rna"),
  redo = NULL,
  fontFamily = "sans",
  pal = c("blue", "red"),
  numWords = 15,
  preserve = FALSE,
  metab = NULL,
  metThresh = 0.2,
  curate = TRUE,
  abstArg = list(),
  tagPalette = NULL,
  metCol = NULL,
  scaleRange = c(5, 10),
  showLegend = FALSE,
  ecPlot = FALSE,
  edgeLabel = FALSE,
  mbPlot = FALSE,
  onlyTDM = FALSE,
  ecFile = NULL,
  upTaxFile = NULL,
  filterMax = FALSE,
  mbColor = "grey",
  useUdpipe = FALSE,
  colorize = FALSE,
  cooccurrence = FALSE,
  udpipeModel = "english-ewt-ud-2.5-191206.udpipe",
  scaleFreq = NULL,
  ngram = 1,
  plotType = "network",
  disPlot = FALSE,
  onWholeDTM = FALSE,
  naEdgeColor = "grey50",
  useggwordcloud = TRUE,
  wcScale = 10,
  addFreqToMB = FALSE,
  catColors = NULL,
  useSeed = 42,
  discreteColorWord = FALSE,
  colorText = FALSE,
  corThresh = 0.2,
  tag = "none",
  tagWhole = FALSE,
  stem = FALSE,
  layout = "nicely",
  edgeLink = TRUE,
  deleteZeroDeg = TRUE,
  cl = FALSE,
  autoThresh = TRUE,
  argList = list(),
  docsum = FALSE,
  absolute = TRUE,
  corOption = list(),
  cache = TRUE
)

manual(
  df,
  madeUpper = NULL,
  useFil = NA,
  filType = "above",
  cooccurrence = FALSE,
  filNum = 0,
  useQuanteda = FALSE,
  quantedaArgs = list(),
  pvclAlpha = 0.95,
  numOnly = TRUE,
  tfidf = FALSE,
  cl = FALSE,
  pal = c("blue", "red"),
  numWords = 30,
  scaleRange = c(5, 10),
  scaleFreq = NULL,
  showLegend = FALSE,
  plotType = "network",
  colorText = FALSE,
  corThresh = 0.2,
  layout = "nicely",
  tag = "none",
  tagWhole = FALSE,
  onlyCorpus = FALSE,
  onlyTDM = FALSE,
  queryColor = "grey",
  edgeLabel = FALSE,
  edgeLink = TRUE,
  ngram = 1,
  colorize = FALSE,
  tagPalette = NULL,
  preserve = FALSE,
  takeMax = FALSE,
  catColors = NULL,
  deleteZeroDeg = TRUE,
  additionalRemove = NA,
  naEdgeColor = "grey50",
  normalize = FALSE,
  takeMean = FALSE,
  queryPlot = FALSE,
  collapse = FALSE,
  onWholeDTM = FALSE,
  stem = FALSE,
  argList = list(),
  useUdpipe = FALSE,
  discreteColorWord = FALSE,
  autoThresh = TRUE,
  useggwordcloud = TRUE,
  wcScale = 10,
  fontFamily = "sans",
  addFreqToNonWords = FALSE,
  filterByGO = FALSE,
  docsum = FALSE,
  absolute = TRUE,
  corOption = list(),
  udpipeModel = "english-ewt-ud-2.5-191206.udpipe",
  useSeed = 42
)

pubmed(
  queries,
  useRawQuery = FALSE,
  redo = NULL,
  madeUpper = c("dna", "rna"),
  target = "abstract",
  useFil = NA,
  filType = "above",
  filNum = 0,
  sortOrder = "relevance",
  fontFamily = "sans",
  pvclAlpha = 0.95,
  numOnly = TRUE,
  delim = "OR",
  apiKey = NULL,
  tfidf = FALSE,
  cl = FALSE,
  autoThresh = TRUE,
  pal = c("blue", "red"),
  numWords = 30,
  scaleRange = c(5, 10),
  showLegend = FALSE,
  plotType = "network",
  colorText = FALSE,
  quote = FALSE,
  corThresh = 0.2,
  layout = "nicely",
  tag = "none",
  tagWhole = FALSE,
  onlyCorpus = FALSE,
  onlyTDM = FALSE,
  retMax = 10,
  edgeLabel = FALSE,
  edgeLink = TRUE,
  ngram = 1,
  genePlot = FALSE,
  scaleFreq = NULL,
  onlyDf = FALSE,
  tagPalette = NULL,
  preserve = FALSE,
  takeMax = FALSE,
  catColors = NULL,
  perQuery = FALSE,
  discreteColorWord = FALSE,
  useUdpipe = FALSE,
  udpipeOnlyFreq = FALSE,
  udpipeOnlyFreqNB = FALSE,
  addFreqToQuery = FALSE,
  naEdgeColor = "grey50",
  cooccurrence = FALSE,
  colorize = FALSE,
  queryColor = "grey",
  useggwordcloud = TRUE,
  wcScale = 10,
  distinguish_query = TRUE,
  useSeed = 42,
  udpipeModel = "english-ewt-ud-2.5-191206.udpipe",
  normalize = FALSE,
  takeMean = FALSE,
  absolute = TRUE,
  corOption = list(),
  deleteZeroDeg = TRUE,
  additionalRemove = NA,
  orgDb = org.Hs.eg.db,
  onlyGene = FALSE,
  filterByGO = FALSE,
  docsum = FALSE,
  pre = FALSE,
  onWholeDTM = FALSE,
  madeUpperGenes = TRUE,
  stem = FALSE,
  argList = list(),
  dateRange = NULL,
  cc0 = FALSE
)

refseq(
  geneList,
  keyType = "SYMBOL",
  excludeFreq = 2000,
  exclude = "frequency",
  filterMax = FALSE,
  excludeType = ">",
  tfidf = FALSE,
  genePlotNum = 10,
  preserve = FALSE,
  takeMax = FALSE,
  additionalRemove = NA,
  onlyCorpus = FALSE,
  madeUpper = c("dna", "rna"),
  organism = 9606,
  pal = c("blue", "red"),
  numWords = 30,
  scaleRange = c(5, 10),
  autoScale = FALSE,
  showLegend = FALSE,
  orgDb = org.Hs.eg.db,
  edgeLabel = FALSE,
  naEdgeColor = "grey50",
  cooccurrence = FALSE,
  pvclAlpha = 0.95,
  cl = FALSE,
  ngram = 1,
  plotType = "network",
  onlyTDM = FALSE,
  stem = FALSE,
  colorText = FALSE,
  corThresh = 0.2,
  genePlot = FALSE,
  autoThresh = TRUE,
  autoNumWords = FALSE,
  genePathPlot = NULL,
  genePathPlotSig = 0.05,
  tag = "none",
  layout = "nicely",
  edgeLink = TRUE,
  deleteZeroDeg = TRUE,
  enrich = NULL,
  topPath = 10,
  ora = FALSE,
  tagWhole = FALSE,
  mergeCorpus = NULL,
  numOnly = TRUE,
  madeUpperGenes = TRUE,
  onWholeDTM = FALSE,
  pre = TRUE,
  takeMean = FALSE,
  tagPalette = NULL,
  collapse = FALSE,
  addFreqToGene = FALSE,
  useUdpipe = FALSE,
  normalize = FALSE,
  fontFamily = "sans",
  udpipeModel = "english-ewt-ud-2.5-191206.udpipe",
  scaleFreq = NULL,
  colorize = FALSE,
  geneColor = "grey",
  argList = list(),
  useggwordcloud = TRUE,
  wcScale = 10,
  catColors = NULL,
  discreteColorWord = FALSE,
  useSeed = 42,
  scaleEdgeWidth = c(1, 3),
  splitByEA = NULL,
  filterByGO = FALSE,
  docsum = FALSE,
  absolute = TRUE,
  corOption = list()
)

Arguments

geneList

gene ID list

alliancePath

path to The Alliance of Genome Resources gene description file. default to "GENE-DESCRIPTION-TSV_HUMAN.tsv"

keyType

default to SYMBOL

excludeFreq

default to 2000 for 'alliance' and 'refseq', and 1000 for 'bugsigdb'

exclude

"frequency" or "tfidf",

filterMax

Use the pre-calculated filter based on max values when excluding by TfIdf. Otherwise, take the sum.

excludeType

">" or "<", combined with 'exclude' and 'excludeFreq', e.g. filter the words with the pre-calculated frequency > 5000

tfidf

use TfIdf when making TDM, default to FALSE.

genePlotNum

number of genes to be plotted (default: 10)

preserve

Try to preserve original characters.

takeMax

Take max values for each term in term-document matrix

additionalRemove

specific words to be excluded

onlyCorpus

return only corpus (tm).

madeUpper

make these words uppercase in the resulting plot, default to c("dna", "rna") (NULL in 'manual')

organism

organism ID to use in 'GeneSummary'

pal

palette for color gradient in correlation network. should be a vector of length two like c("red","blue").

numWords

the number of words to be shown in the plot. When 'autoThresh' is TRUE, the threshold is chosen so that this number of words is shown.

scaleRange

scale for label and node size in the network.

autoScale

scale the label and node size automatically for the large network.

showLegend

whether to show legend in the network

orgDb

the database used to convert identifiers, default to org.Hs.eg.db.

edgeLabel

if TRUE, plot the edge label (default: FALSE)

naEdgeColor

edge colors for NA values (linking query with the category other than text)

cooccurrence

default to FALSE, if TRUE, use cooccurrence instead of correlation.

pvclAlpha

alpha value for pvpick()

cl

for parPvclust, parallel clustering can be performed

ngram

N-gram specification, default to 1.

plotType

"wc" or "network", default to "network"

onlyTDM

return only TDM (tm).

stem

whether to use stemming when making corpus.

colorText

color text label based on frequency in the network

corThresh

the correlation (cooccurrence) threshold.

genePlot

plot associated genes (default: FALSE). This option first calculates the high-frequency words, subsequently calculates the occurrences of these words in each gene, and prioritizes the genes by them.

autoThresh

automatically choose thresholding value to show the 'numWords', when deleteZeroDeg (deleting no-connected words) is TRUE, which is default.

genePathPlot

plot associated genes and pathways (default: NULL). Must be "kegg" or "reactome"; automatically sets genePlot to TRUE.

genePathPlotSig

threshold for adjusted p-values (default: 0.05)

tag

perform pvclust on words and colorize them in the word cloud or network. The value must be one of those accepted in the pvclust 'method.dist' option, like "correlation". Default to "none", which performs no tagging.

layout

the layout for the network, default to "nicely". It can be one of the layouts implemented in 'igraph' and 'ggraph', such as 'kk' (Kamada-Kawai), 'nicely' (automatic selection of algorithm), 'drl' (the force-directed DrL layout). The options are available at: https://igraph.org/r/doc/layout_.html

edgeLink

if FALSE, use geom_edge_diagonal. if TRUE, geom_edge_link. Default to TRUE.

deleteZeroDeg

delete zero degree node from plot in the network

enrich

currently, only 'reactome' and 'kegg' are supported.

topPath

how many pathway descriptions are included in text analysis, sorted by p-values in the results.

tagWhole

whether to perform pvclust on whole matrix or subset of the matrix.

mergeCorpus

specify multiple corpora if you intend to combine them, like PubMed information and RefSeq summary

numOnly

delete number only (not deleting XXX123, but delete only the number)

madeUpperGenes

make genes upper case automatically (default to TRUE). This uses the 'SYMBOL' key in 'orgDb'.

onWholeDTM

calculate correlation network on whole dataset or top-words specified by numWords.

pre

remove preset filtering words.

takeMean

take mean values for each term in term-document matrix.

tagPalette

tag palette used when 'tag' is specified (not "none"). It is also used for the dependency network using udpipe, and for tagging colorization in the word cloud. Default to NULL, in which case the palette is set automatically.

collapse

default to FALSE, collapse all the sentences.

addFreqToGene

add pseudo frequency to gene in genePlot, default to FALSE.

useUdpipe

use udpipe to make a dependency network.

normalize

sum normalize the term frequency document-wise.

fontFamily

font family to use, default to "sans".

udpipeModel

udpipe model file name.

scaleFreq

default to NULL, scale the value if specified

colorize

color the word nodes by their frequency, and the other nodes by their category. if colorize=FALSE and addFreqToGene=TRUE, gene nodes are colorized according to the minimum frequency of the words in the network

geneColor

color for associated genes with words (used when tag or colorize option is TRUE)

argList

parameters to pass to wordcloud() or ggwordcloud()

useggwordcloud

default to TRUE, otherwise use 'wordcloud' function.

wcScale

scaling size for ggwordcloud

catColors

colors for words and texts when colorize is TRUE and discreteColorWord is TRUE

discreteColorWord

colorize words by "Words" category, not frequency.

useSeed

random seed

scaleEdgeWidth

scale for edge width

filterByGO

filter the results to the words obtained from GO terms, while preserving the number of words to be shown

docsum

if TRUE, convert the term-document matrix to binary.

absolute

calculate absolute correlation value

corOption

passed to 'cor' function, like list("method"="kendall")

mbList

microbe ID list

target

"title" or "abstract"

apiKey

api key for eutilities

redo

if plotting again with other parameters, input the previously returned list

metab

tibble of metabolite - taxon association

metThresh

threshold of association

curate

include articles in bugsigdb

abstArg

passed to PubMed function when using curate=FALSE

metCol

metabolite data frame column name in the order of "candidate taxon", "metabolite", "quantitative values for thresholding"

ecPlot

plot links between enzymes and microbes. This option requires two files to be passed to enzyme() and getUPTax().

mbPlot

plot microbe names

ecFile

enzyme database file

upTaxFile

UniProt taxonomy file

mbColor

color for Microbes when tagPalette or catColors is not specified

disPlot

plot diseases

addFreqToMB

add pseudo frequency to microbes in mbPlot

cache

Caching when BugSigDB is downloaded (default to TRUE)

df

manual document data.frame (must have column 'text') or vector of text. If 'query' column and other columns are present, regards them as category related to the text on the same row.

useFil

filter based on "GS_TfIdf" (whole gene summary tf-idf) or "BSDB_TfIdf" (whole bugsigdb tf-idf)

filType

"above" or "below"

filNum

specify filter tfidf

useQuanteda

use quanteda functions to generate

quantedaArgs

list of arguments to be passed to tokens()

queryColor

color for associated queries with words

queryPlot

plot the query in the graph in relation with the words

addFreqToNonWords

add pseudo-frequency corresponding to minimum frequency of the words to nodes other than words

queries

query ID list

useRawQuery

if you would like to send the query as is, please set this option to TRUE.

sortOrder

sort order, passed to rentrez function

delim

delimiter for queries

quote

whether to quote the queries

retMax

how many items are to be retrieved?

onlyDf

return only the raw data.frame of searching PubMed

perQuery

search for the queries one by one recursively, not using 'delim'.

udpipeOnlyFreq

when using udpipe, include only high-frequency words

udpipeOnlyFreqNB

when using udpipe, include only the neighbors of high-frequency words

addFreqToQuery

add pseudo-frequency to query node

distinguish_query

if TRUE, distinguish query and returned texts by appending (Q) on query

onlyGene

plot only the gene symbol (orgDb with SYMBOL key can be used)

dateRange

if specified, restrict the range of publication date. Must be a vector of length two, like 'c("2013/1/1", "2023/1/1")'

cc0

effective only in 'pubmed' function. The prefetched data in 'pubmedMini' package is used for gene query.

autoNumWords

determine the number of words to be shown by ORA, default to FALSE.

ora

perform over-representation analysis or not (experimental)

splitByEA

automatically split the genes based on significant enrichment analysis results, and return a list of objects, one for each term. Default to NULL. Must be 'kegg' or 'reactome', in which case the function performs over-representation analysis by enrichKEGG or enrichPathway in clusterProfiler and ReactomePA.

Details

The main functions of the biotextgraph package. The functions accept a character vector of biological entities (such as gene identifiers) and return the summarized statistics and visualization contained in a biotext object.

Value

'biotext' class object

Examples

geneList <- c("DDX41","PNKP","ERCC1","IRF3","XRCC1")
## Not run: alliance(geneList)
mbList <- c("Veillonella dispar")
## Not run: 
    bugsigdb(mbList, plotType="wc")

## End(Not run)
ret <- refseq("DDX41", plotType="wc")
manual(getSlot(ret, "rawText")$Gene_summary, plotType="wc")
## Not run: pubmed("DDX41")
geneList <- c("DDX41","PNKP","ERCC1","IRF3","XRCC1")
refseq(geneList)


noriakis/wcGeneSummary documentation built on April 22, 2024, 7:12 a.m.