#' Converts the human proteome, as downloaded from UniProt,
#' into a more convenient data structure to work with: a named
#' list of strings.
#'
#' Also creates a few other files that are helpful for the analysis.
#'
#' Proteome is downloaded from:
#'
#' ftp://ftp.ebi.ac.uk/pub/databases/reference_proteomes/
#' QfO/Eukaryota/UP000005640_9606.fasta.gz
#'
#' Proteome used in Bianchi et al., 2017 used:
#'
#' https://github.com/richelbilderbeek/bianchi_et_al_2017/raw/master/proteome_2017/UP000005640_9606.fasta.gz
#'
#' @inheritParams default_params_doc
#' @param fasta_filename proteome as FASTA file,
#' for example `proteome/UP000005640_9606.fasta.gz`:
#' @author Richèl J.C. Bilderbeek, adapted from Johannes Textor
#' @export
prepare_data <- function(
fasta_filename,
trans_membrane_analysis_filename,
protein_lengths_filename,
proteome_as_data_filename,
tmh_9mers_as_data_filename
) {
testthat::expect_true(file.exists(fasta_filename))
testthat::expect_true(file.exists(trans_membrane_analysis_filename))
x <- seqinr::read.fasta(fasta_filename,
forceDNAtolower = FALSE, as.string = TRUE )
x <- lapply( x, function(x) x[1] )
names( x ) <- sapply( strsplit( names(x), "\\|" ), function(x) x[2] )
# Save protein lengths to file
sink(protein_lengths_filename)
for( i in seq_along(x) ){
cat(names(x)[i]," ",nchar(x[[i]]),"\n",sep="")
}
sink()
testthat::expect_true(file.exists(protein_lengths_filename))
proteome <- x
save(proteome,file = proteome_as_data_filename)
# $A0A024R1R8
# [1] "MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAKVVGKGPLATGGIKKSGKK"
#
# $A0A024RBG1
# [1] "MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQWIVPGGGMEPEEEPGGAAVREVYEEAGVKGKLGRLLGIFEQNQDRKHRTYVYVLTVTEILEDWEDSVNIGRKREWFKVEDAIKVLQCHKPVHAEYLEKLKLGCSPANGNSTVPSLPDNNALFVTAAQTSGLPSSVR"
#
# $A0A075B6H5
# [1] "METVVTTLPREGGVGPSRKMLLLLLLLGPGSGLSAVVSQHPSRVICKSGTSVNIECRSLDFQATTMFWYRQLRKQSLMLMATSNEGSEVTYEQGVKKDKFPINHPNLTFSALTVTSAHPEDSSFYICSAR"
# generate a list containing the starting position of all 9mers overlapping
# with predicted transmembrane helices
testthat::expect_true(file.exists(trans_membrane_analysis_filename))
x <- utils::read.table(trans_membrane_analysis_filename)
# V1 V2 V3 V4
# 1 A0A075B6K6 inside 1 1
# 2 A0A075B6K6 TMhelix 2 24
# 3 A0A075B6K6 outside 25 122
# 4 A0A075B734 inside 1 40
testthat::expect_true(all(x$V2 %in% c("inside", "TMhelix", "outside")))
# Select only the TMH
x <- x[x$V2 == "TMhelix", c(1,3,4)]
# V1 V3 V4
# 2 A0A075B6K6 2 24
# 5 A0A075B734 41 63
# 7 A0A075B734 68 90
# 9 A0A075B734 124 146
# Convert from (name, from_index, to_index) to a name with the actual
# sequence from [from_index - 8, to_index]
# NOTE: I have no idea why 'from_index - 8' is used
#
#
x <- tapply( 1:nrow(x), list(x$V1),
function(p) do.call( c, lapply( p, function(k) seq(x[k,2] - 8, x[k,3])) ) )
# $A0A075B6K6
# [1] -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
#
# $A0A075B734
# [1] 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
# [24] 56 57 58 59 60 61 62 63 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
# [47] 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 116 117 118 119 120 121 122
# [70] 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
# [93] 146 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
# [116] 170 171 172 173 174 175 176 177 178 191 192 193 194 195 196 197 198 199 200 201 202 203 204
# [139] 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 251 252 253 254 255 256
# [162] 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
# [185] 280 281
#
# $A0A087WTH1
# [1] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
# [24] 47 48 49 50 51 52 53 54 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
# [47] 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
for (protein_name in names(x)) {
if (is.null(proteome[[protein_name]])) {
next
}
testthat::expect_true(!is.null(x[[protein_name]]))
indices <- x[[protein_name]]
protein_sequence <- proteome[[protein_name]]
protein_sequence_length <- nchar(protein_sequence)
x[[protein_name]] <- intersect(
indices,
1:(protein_sequence_length - 8)
)
}
for( i in names(proteome) ){
if( is.null(x[[i]]) ){
x[[i]] <- integer(0)
}
}
tmh.9mers <- x
save(tmh.9mers, file = tmh_9mers_as_data_filename)
# Head:
#
# $A0A075B6K6
# [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
#
# $A0A075B734
# [1] 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
# [24] 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
# [47] 79 80 81 82 83 84 85 86 87 88 89 90 116 117 118 119 120 121 122 123 124 125 126
# [70] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 148 149 150
# [93] 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
# [116] 174 175 176 177 178 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
# [139] 209 210 211 212 213 214 215 216 217 218 219 220 221 251 252 253 254 255 256 257 258 259 260
# [162] 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281
#
# $A0A087WTH1
# [1] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
# [24] 47 48 49 50 51 52 53 54 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
# [47] 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.