Added unit test for genbank fetching
This commit is contained in:
parent
7b079650e0
commit
18a13083dc
4
.vscode/settings.json
vendored
Normal file
4
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"python.testing.unittestEnabled": false,
|
||||||
|
"python.testing.pytestEnabled": true
|
||||||
|
}
|
@ -38,7 +38,7 @@ addopts = [
|
|||||||
asyncio_mode = "auto"
|
asyncio_mode = "auto"
|
||||||
|
|
||||||
[tool.pylint.main]
|
[tool.pylint.main]
|
||||||
source-roots = src
|
source-roots = "src"
|
||||||
|
|
||||||
[tool.pylint.format]
|
[tool.pylint.format]
|
||||||
# Maximum number of characters on a single line.
|
# Maximum number of characters on a single line.
|
||||||
|
@ -1,2 +1,4 @@
|
|||||||
requests
|
requests
|
||||||
biopython
|
biopython
|
||||||
|
pytest
|
||||||
|
pytest-asyncio
|
34
src/mlstmyfasta/engine/annotations.py
Normal file
34
src/mlstmyfasta/engine/annotations.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
from typing import Any, Generator, List, Sequence
|
||||||
|
from Bio.Align import PairwiseAligner
|
||||||
|
from Bio import Entrez
|
||||||
|
from Bio import SeqIO
|
||||||
|
|
||||||
|
from mlstmyfasta.engine.data.genomics import Strand, StrandFeature, get_feature_coding
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_ncbi_genbank(genbank_id: str) -> Strand:
|
||||||
|
with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
|
||||||
|
record = SeqIO.read(fetch_stream, "genbank")
|
||||||
|
sequence_features = list()
|
||||||
|
for feature in record.features:
|
||||||
|
start = int(feature.location.start)
|
||||||
|
end = int(feature.location.end)
|
||||||
|
sequence_features.append(StrandFeature(
|
||||||
|
type=feature.type,
|
||||||
|
start=start,
|
||||||
|
end=end+1, # Position is exclusive
|
||||||
|
feature_properties=feature.qualifiers
|
||||||
|
))
|
||||||
|
return Strand(name=genbank_id, coding=str(record.seq), features=sequence_features)
|
||||||
|
|
||||||
|
|
||||||
|
async def annotate_from_genbank(genbank_id: str, query_coding: str):
|
||||||
|
strand = await fetch_ncbi_genbank(genbank_id=genbank_id)
|
||||||
|
aligner = PairwiseAligner("blastn")
|
||||||
|
aligner.mode = "local"
|
||||||
|
for feature in strand.features:
|
||||||
|
feature_coding = get_feature_coding(strand=strand, feature=feature)
|
||||||
|
if len(aligner.align(query_coding, feature)) < 1:
|
||||||
|
# TODO implement a failsafe
|
||||||
|
continue
|
||||||
|
sequence = sorted(aligner.align(query_coding, feature))[0]
|
@ -1,24 +0,0 @@
|
|||||||
from typing import Any, Generator, List, Sequence
|
|
||||||
from Bio.Align import PairwiseAligner
|
|
||||||
from Bio import Entrez
|
|
||||||
from Bio import SeqIO
|
|
||||||
|
|
||||||
from mlstmyfasta.engine.data.genomics import Strand, StrandFeature
|
|
||||||
|
|
||||||
async def fetch_ncbi_genbank(genbank_id: str) -> Strand:
|
|
||||||
with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
|
|
||||||
record = SeqIO.read(fetch_stream, "genbank")
|
|
||||||
sequence_features = list()
|
|
||||||
for feature in record.features:
|
|
||||||
start, end = feature.location.split("..")
|
|
||||||
start = int(start)
|
|
||||||
end = int(end)
|
|
||||||
feature_properties = dict()
|
|
||||||
for qualifier in feature.qualifiers:
|
|
||||||
feature_properties[qualifier.key] = qualifier.value
|
|
||||||
sequence_features.append(StrandFeature(name=feature.key,
|
|
||||||
start=start,
|
|
||||||
end=end,
|
|
||||||
feature_properties=feature_properties
|
|
||||||
))
|
|
||||||
return Strand(name=genbank_id, coding=record.sequence, features=sequence_features)
|
|
@ -4,10 +4,10 @@ from typing import Mapping, Sequence
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class StrandFeature:
|
class StrandFeature:
|
||||||
name: str
|
type: str
|
||||||
start: int
|
start: int
|
||||||
end: int
|
end: int
|
||||||
feature_properties: Mapping[str, str]
|
feature_properties: Mapping[str, Sequence[str]]
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Strand:
|
class Strand:
|
||||||
@ -15,4 +15,5 @@ class Strand:
|
|||||||
coding: str
|
coding: str
|
||||||
features: Sequence[StrandFeature]
|
features: Sequence[StrandFeature]
|
||||||
|
|
||||||
|
def get_feature_coding(strand: Strand, feature: StrandFeature):
|
||||||
|
strand.coding[feature.start:feature.end]
|
6
tests/mlstmyfasta/engine/test_annotations.py
Normal file
6
tests/mlstmyfasta/engine/test_annotations.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
from mlstmyfasta.engine.annotations import fetch_ncbi_genbank
|
||||||
|
|
||||||
|
|
||||||
|
async def test_fetch_ncbi_genbank_with_id_works():
|
||||||
|
assert len((await fetch_ncbi_genbank("CP011448.1")).coding) > 0
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user