Added unit test for genbank fetching

This commit is contained in:
Harrison Deng 2025-01-02 18:49:23 +00:00
parent 7b079650e0
commit 18a13083dc
8 changed files with 52 additions and 29 deletions

4
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,4 @@
{
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}

View File

@ -38,7 +38,7 @@ addopts = [
asyncio_mode = "auto"
[tool.pylint.main]
source-roots = src
source-roots = "src"
[tool.pylint.format]
# Maximum number of characters on a single line.

View File

@ -1,2 +1,4 @@
requests
biopython
pytest
pytest-asyncio

View File

@ -0,0 +1,34 @@
from typing import Any, Generator, List, Sequence
from Bio.Align import PairwiseAligner
from Bio import Entrez
from Bio import SeqIO
from mlstmyfasta.engine.data.genomics import Strand, StrandFeature, get_feature_coding
async def fetch_ncbi_genbank(genbank_id: str) -> Strand:
with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
record = SeqIO.read(fetch_stream, "genbank")
sequence_features = list()
for feature in record.features:
start = int(feature.location.start)
end = int(feature.location.end)
sequence_features.append(StrandFeature(
type=feature.type,
start=start,
end=end+1, # Position is exclusive
feature_properties=feature.qualifiers
))
return Strand(name=genbank_id, coding=str(record.seq), features=sequence_features)
async def annotate_from_genbank(genbank_id: str, query_coding: str):
strand = await fetch_ncbi_genbank(genbank_id=genbank_id)
aligner = PairwiseAligner("blastn")
aligner.mode = "local"
for feature in strand.features:
feature_coding = get_feature_coding(strand=strand, feature=feature)
if len(aligner.align(query_coding, feature)) < 1:
# TODO implement a failsafe
continue
sequence = sorted(aligner.align(query_coding, feature))[0]

View File

@ -1,24 +0,0 @@
from typing import Any, Generator, List, Sequence
from Bio.Align import PairwiseAligner
from Bio import Entrez
from Bio import SeqIO
from mlstmyfasta.engine.data.genomics import Strand, StrandFeature
async def fetch_ncbi_genbank(genbank_id: str) -> Strand:
with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
record = SeqIO.read(fetch_stream, "genbank")
sequence_features = list()
for feature in record.features:
start, end = feature.location.split("..")
start = int(start)
end = int(end)
feature_properties = dict()
for qualifier in feature.qualifiers:
feature_properties[qualifier.key] = qualifier.value
sequence_features.append(StrandFeature(name=feature.key,
start=start,
end=end,
feature_properties=feature_properties
))
return Strand(name=genbank_id, coding=record.sequence, features=sequence_features)

View File

@ -4,10 +4,10 @@ from typing import Mapping, Sequence
@dataclass
class StrandFeature:
name: str
type: str
start: int
end: int
feature_properties: Mapping[str, str]
feature_properties: Mapping[str, Sequence[str]]
@dataclass
class Strand:
@ -15,4 +15,5 @@ class Strand:
coding: str
features: Sequence[StrandFeature]
def get_feature_coding(strand: Strand, feature: StrandFeature):
strand.coding[feature.start:feature.end]

View File

@ -0,0 +1,6 @@
from mlstmyfasta.engine.annotations import fetch_ncbi_genbank
async def test_fetch_ncbi_genbank_with_id_works():
assert len((await fetch_ncbi_genbank("CP011448.1")).coding) > 0