Update dbSanitizeSequence() to remove FASTA header lines (line-break-terminated lines that begin with ">")

This commit is contained in:
hyginn 2017-09-29 12:25:00 -04:00
parent 3b5e65aa8d
commit e1dbd1d268

View File

@ -19,8 +19,10 @@ if (! require("jsonlite", quietly = TRUE)) {
dbSanitizeSequence <- function(s, unambiguous = TRUE) { dbSanitizeSequence <- function(s, unambiguous = TRUE) {
# Flatten any structure that s has, remove all non-letters, convert to # Remove FASTA header lines, if any,
# uppercase. # flatten any structure that s has,
# remove all non-letters,
# convert to uppercase.
# #
# Parameters: # Parameters:
# s chr A DNA or protein sequence plus other characters # s chr A DNA or protein sequence plus other characters
@ -32,7 +34,11 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) {
# to set unambiguous = FALSE to process RNA sequences with Uracil. # to set unambiguous = FALSE to process RNA sequences with Uracil.
# Value: chr a valid, uppercase, amino acid sequence # Value: chr a valid, uppercase, amino acid sequence
# #
s <- paste(unlist(s), collapse="")
s <- as.character(unlist(s)) # convert complex object to plain chr vector
s <- unlist(strsplit(s, "\n")) # split up at linebreaks, if any
s <- s[! grepl("^>", s)] # drop all lines beginning with ">" (FASTA header)
s <- paste(s, collapse="") # combine into single string
s <- toupper(gsub("[^a-zA-Z]", "", s)) s <- toupper(gsub("[^a-zA-Z]", "", s))
if (unambiguous) { if (unambiguous) {
amb <- "([bjouxzBJOUXZ])" # parentheses capture the match amb <- "([bjouxzBJOUXZ])" # parentheses capture the match