Skip to content
Snippets Groups Projects
Commit a8b44a0f authored by Carlos Vega's avatar Carlos Vega
Browse files

moved file to python package

parent 87ed3826
No related branches found
No related tags found
No related merge requests found
import logging
from typing import List, Set, DefaultDict
from collections import defaultdict
import itertools
import requests
import json
import numpy as np
from textminingservice.TextMiningService import TextMiningService
from textminingservice.models.coocurrence import CoOccurrence
from textminingservice.models.publication import Publication
logger = logging.getLogger(__name__)
class PMC_Europe_Service(TextMiningService):
    """Text mining client backed by the Europe PMC annotations API.

    Retrieves the articles mentioning each given entity and incrementally
    intersects the per-entity result sets, accumulating a rank-based score
    per article (earlier articles — i.e. those with more mentions — score
    higher).
    """

    # Number of articles requested per API page.
    MAX_PAGE_SIZE = 8
    BASE_URL = 'https://www.ebi.ac.uk'
    MENTION_URL = BASE_URL + \
        '/europepmc/annotations_api/annotationsByEntity?entity={}&filter={}&format={}&cursorMark={}&pageSize={}'

    def __init__(self):
        # NOTE(review): 'PCM' looks like a typo for 'PMC'; kept byte-identical
        # in case callers look services up by name — confirm before renaming.
        super().__init__('PCM Europe',
                         'This client communicates with PCM Europe API.')

    def _get_single_entity_mentions(self, entity: str):
        """Yield ``(article, ext_id)`` for every article mentioning *entity*.

        Pages through the Europe PMC annotations endpoint, following
        ``nextCursorMark`` until the cursor stops advancing. The API returns
        articles sorted by number of mentions.
        See https://europepmc.org/AnnotationsApi#!/annotations45api45controller/getAnnotationsArticlesByEntityUsingGET

        Arguments:
            entity {str} -- entity name to search mentions for.

        Raises:
            requests.HTTPError: if the API responds with an error status.
        """
        prev_cursor_mark = -1
        cursor_mark = 0
        counter = 0
        while cursor_mark != prev_cursor_mark:
            url = PMC_Europe_Service.MENTION_URL.format(
                entity, 1, 'ID_LIST', cursor_mark, PMC_Europe_Service.MAX_PAGE_SIZE)
            # Use the module logger instead of print() for debug output.
            logger.debug('Fetching page (articles so far: %d): %s', counter, url)
            response = requests.get(url)
            # Explicit HTTP error instead of `assert response.ok`: asserts
            # are stripped under `python -O` and hide the failing status.
            response.raise_for_status()
            data = response.json()
            prev_cursor_mark = cursor_mark
            cursor_mark = data['nextCursorMark']
            for article in data['articles']:
                counter += 1
                yield article, article['extId']

    def _incremental_intersection(self, entity: str, white_list: DefaultDict[str, float] = None):
        """Intersect *entity*'s articles with *white_list* and rescore them.

        Takes the given entity and returns every article with an ID contained
        in the given white list, plus a new white list with the surviving ids
        and recalculated scores. The score is the normalized position
        ``1 - pos/length``, giving more importance to the first article
        (Europe PMC sorts articles by number of occurrences). Scores are
        added to the previous ones, so repeated calls accumulate.

        Arguments:
            entity {str} -- entity to fetch mentions for.
            white_list -- id -> accumulated score from previous calls, or
                ``None``/empty to keep every article.

        Returns:
            (list, dict) -- kept articles and the new id -> score mapping
            (never larger than the given white list, when one was given).
        """
        if white_list is None:
            white_list = defaultdict(float)
        kept_articles = []
        positions = []
        kept_ids = []
        index = 0
        for article, article_id in self._get_single_entity_mentions(entity):
            index += 1
            # With no white list keep everything; otherwise keep only the
            # articles already present (incremental intersection).
            if len(white_list) == 0 or article_id in white_list:
                kept_articles.append(article)
                positions.append(index)
                kept_ids.append(article_id)
        if index == 0:
            # The entity had no mentions at all: the intersection is empty.
            # Guard against the ZeroDivisionError the division below would
            # raise with index == 0.
            return [], {}
        scores = 1 - np.array(positions) / index  # normalize position score
        new_white_list = dict(zip(kept_ids, scores))
        # Accumulate the scores obtained in previous calls. (`ext_id` rather
        # than `id`, which shadows the builtin.)
        for ext_id in kept_ids:
            new_white_list[ext_id] += white_list.get(ext_id, 0.0)
        return kept_articles, new_white_list

    def get_mentions(self, entities: List[str], limit: int = 20) -> List[Publication]:
        """Return publications mentioning ALL given entities, best first.

        Since Europe PMC sorts publications by number of occurrences, the
        accumulated rank score can be seen as a degree of co-occurrence.

        Arguments:
            entities {List[str]} -- entities that must all be mentioned.
            limit {int} -- maximum number of publications to return.

        Returns:
            List[Publication] -- at most *limit* publications, sorted by
            descending accumulated score.
        """
        white_list = None
        article_list = []
        for entity in entities:
            article_list, white_list = self._incremental_intersection(
                entity, white_list=white_list)
        # The last iteration contains the final intersection.
        if white_list is None:
            # No entities were given.
            return []
        scores = []
        publications = []
        for article in article_list:
            article_id = article['extId']
            publications.append(
                Publication(pm_id=article_id, pmc_id=article['pmcid']))
            scores.append(white_list[article_id])
        # Sort by descending score and honour `limit`, which the original
        # implementation ignored. Returning a plain list matches the
        # declared return annotation (previously an np.ndarray leaked out).
        order = np.argsort(scores)[::-1][:limit]
        return [publications[i] for i in order]

    def get_co_occurrences(self, entity: str, limit: int = 20, types: List[str] = None) -> List[CoOccurrence]:
        # TODO: not implemented yet for this backend.
        pass
if __name__ == "__main__":
pmc = PMC_Europe_Service()
for pub in pmc.get_mentions(['P53', 'PRDM1']):
print(pub)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment