biokb.py

from textminingservice_biokb.utils import uri_to_entity_code, standardise_underscored_entity_code
from textminingservice.models.publication import Publication
from textminingservice.models.coocurrence import CoOccurrence
from textminingservice.TextMiningService import TextMiningService
from SPARQLWrapper.SPARQLExceptions import QueryBadFormed
from SPARQLWrapper import SPARQLWrapper, JSON, POSTDIRECTLY
from typing import List
import json
import requests
import logging


logger = logging.getLogger(__name__)


class BioKBClientException(Exception):
    pass


class MalformedQueryException(BioKBClientException):
    pass


class BioKBService(TextMiningService):
    SOLR_TRANSLATOR_URL = 'https://biokb.lcsb.uni.lu/api/solr-ids-to-publications'
    # ?solrIds=4b267858-bbde-11e5-9b9d-001a4ae51247&solrIds=593fa4e6-c87e-11e8-ac16-001a4a160176
    SPARQL_URL = 'http://10.240.6.71:8890/sparql'

    def __init__(self):
        self.sparql = SPARQLWrapper(BioKBService.SPARQL_URL)
        self.sparql.setRequestMethod(POSTDIRECTLY)
        super().__init__('BioKB',
                         'This client communicates with BioKB triple store and Publication Solr index.')

    def _run_sparql_query(self, sparql_query):
        try:
            self.sparql.setQuery(sparql_query)
            self.sparql.setReturnFormat(JSON)
            results = self.sparql.query().convert()
            return results
        except QueryBadFormed as e:
            logger.error(e)
            raise MalformedQueryException(e)

    def get_mentions(self, entities: List[str], limit: int = 20) -> List[Publication]:

        entity_subquery = ""
        for entity in entities:
            entity = standardise_underscored_entity_code(entity)
            entity_subquery += f"?publication <http://lcsb.uni.lu/biokb#containsEntity> <http://lcsb.uni.lu/biokb/entities/{entity}> .\n"

        query = """
            select ?publication str(?solrId) as ?solrId where {{
                {}
                ?publication <http://lcsb.uni.lu/biokb#solrId> ?solrId	.
            }} LIMIT {}
            """.format(entity_subquery, limit)

        results = self._run_sparql_query(query)
        solr_ids = set()
        for result in results['results']['bindings']:
            solr_id = result['solrId']['value']
            solr_ids.add(solr_id)
            # pub = Publication(other_id=solr_id)
            # solr_ids[solr_id] = pub

        # translate ids
        response = requests.get(BioKBService.SOLR_TRANSLATOR_URL,
                                data={'solrIds': solr_ids})
        assert response.ok
        data = json.loads(response.content.decode().strip())
        publications = []
        for pub in data['publications']:
            p = Publication(title=pub.get('title', None),
                            journal_title=pub.get('journal_title', None),
                            doi=pub.get('doi', None),
                            pm_id=pub.get('pubmed_id', None),
                            pmc_id=pub.get('pmc_id', None),
                            other_id=pub['id'],
                            year=pub.get('year', None))
            publications.append(p)
        return publications

    def get_co_occurrences(self, entity: str, limit: int = 20, types: List[str] = None) -> List[CoOccurrence]:

        if types is None:
            types = []

        entity_types_filter = ''
        if len(types) > 0:
            types_str = ', '.join((f'<{t}>' for t in types))
            entity_types_filter = f'FILTER (?e_type IN ({types_str}) )'

        entity = standardise_underscored_entity_code(entity)
        query = """
            select * where {
    
                select ?other_entity, (COUNT(*) AS ?count) where {
                    
                    ?s <http://lcsb.uni.lu/biokb#containsEntity> <http://lcsb.uni.lu/biokb/entities/%ENTITY%> .
                    ?s a  <http://lcsb.uni.lu/biokb#Publication> .
                    ?s <http://lcsb.uni.lu/biokb#containsEntity> ?other_entity .
                    ?other_entity a ?e_type .
                    %ENTITY_TYPE_FILTER%
                
                    OPTIONAL {?ss rdfs:subClassOf ?other_entity} .
                
                    FILTER (!bound(?ss)) .
                    FILTER(?other_entity != <http://lcsb.uni.lu/biokb/entities/%ENTITY%>) .
                    
                    OPTIONAL {
                        ?other_entity owl:sameAs ?o_original .
                    } .
                    
                    OPTIONAL {
                        ?other_entity a <http://lcsb.uni.lu/biokb#Protein> .
                        ?other_entity owl:sameAs ?ensembl_protein .
                    }
                }
                
                GROUP BY ?other_entity 
            } ORDER BY DESC(?count) LIMIT %LIMIT%
        """.replace('%ENTITY%', entity).replace('%LIMIT%', str(limit)).replace('%ENTITY_TYPE_FILTER%',
                                                                               entity_types_filter)
        results = self._run_sparql_query(query)
        values = []
        values = []
        for result in results['results']['bindings']:
            entity_code = uri_to_entity_code(result['other_entity']['value'])
            count = int(result['count']['value'])
            co_occur = CoOccurrence(entity_code, count)
            values.append(co_occur)
        return values


if __name__ == "__main__":
    bkb = BioKBService()
    print(bkb.get_co_occurrences('DOID:2841', types=[
        'http://lcsb.uni.lu/biokb#Disease']))
    print('')
    print(bkb.get_co_occurrences('DOID:2841'))
    print('')
    print(bkb.get_mentions(['DOID:2841', 'DOID:1205']))