From a8b44a0fd6a0d43764c74237084ce4b06c21e229 Mon Sep 17 00:00:00 2001 From: Carlos Vega <carlos.vega@uni.lu> Date: Wed, 20 Nov 2019 09:44:38 +0100 Subject: [PATCH] moved file to python package --- .../pmc_europe.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 pmc_europe/textminingservice_pmc_europe/pmc_europe.py diff --git a/pmc_europe/textminingservice_pmc_europe/pmc_europe.py b/pmc_europe/textminingservice_pmc_europe/pmc_europe.py new file mode 100644 index 0000000..4f3ff16 --- /dev/null +++ b/pmc_europe/textminingservice_pmc_europe/pmc_europe.py @@ -0,0 +1,121 @@ +import logging +from typing import List, Set, DefaultDict +from collections import defaultdict + +import itertools +import requests +import json +import numpy as np + +from textminingservice.TextMiningService import TextMiningService +from textminingservice.models.coocurrence import CoOccurrence +from textminingservice.models.publication import Publication + +logger = logging.getLogger(__name__) + + +class PMC_Europe_Service(TextMiningService): + """[summary] + + Arguments: + TextMiningService {[type]} -- [description] + """ + MAX_PAGE_SIZE = 8 + BASE_URL = 'https://www.ebi.ac.uk' + MENTION_URL = BASE_URL + \ + '/europepmc/annotations_api/annotationsByEntity?entity={}&filter={}&format={}&cursorMark={}&pageSize={}' + + def __init__(self): + super().__init__('PCM Europe', + 'This client communicates with PCM Europe API.') + + def _get_single_entity_mentions(self, entity: str): + """ + Generator that yields each article and article id that mentions the given entity + + See https://europepmc.org/AnnotationsApi#!/annotations45api45controller/getAnnotationsArticlesByEntityUsingGET + + The articles come up sorted by number of mentions + """ + prevCursorMark = -1 + cursorMark = 0 + counter = 0 + while cursorMark != prevCursorMark: + url = PMC_Europe_Service.MENTION_URL.format( + entity, 1, 'ID_LIST', cursorMark, PMC_Europe_Service.MAX_PAGE_SIZE) + print(f'Get {counter}') + results = requests.get(url) + assert results.ok + data = json.loads(results.content.decode().strip()) + prevCursorMark = cursorMark + cursorMark = data['nextCursorMark'] + for article in data['articles']: + counter += 1 + yield article, article['extId'] + + def _incremental_intersection(self, entity: str, white_list: DefaultDict[str, float] = None): + """Takes the given entity and returns every article with an ID contained in the given white list as well as a new white list with the new ids and recalculated scores. + The score is the normalized position. (1-pos/length) giving more importance to the first article. + The articles are given my PMC Europe sorted by the number of occurrences. + + The returned white list will be smaller or equal in size to the given white list. + The scores are added up to the previous ones. + """ + + if white_list is None: + white_list = defaultdict(float) + + new_article_list = [] + new_scores = [] + new_ids = [] + index = 0 + for article, article_id in self._get_single_entity_mentions(entity): + index += 1 + # if no white list, keep all, otherwise, only those in white_list + if len(white_list) == 0 or article_id in white_list: + new_article_list.append(article) + new_scores.append(index) + new_ids.append(article_id) + + new_scores = 1-np.array(new_scores)/index # normalize position score + new_white_list = dict(zip(new_ids, new_scores)) + # update scores of the intersection + for id in new_ids: + new_white_list[id] += white_list[id] + + return new_article_list, new_white_list + + def get_mentions(self, entities: List[str], limit: int = 20) -> List[Publication]: + """ + This method returns a list of publications sorted by importance. + Since PMC Europe sorts the publications based on the number of occurrences, + this new score could be seen as the degree of co-occurrence. + """ + white_list = None + article_list = [] + for entity in entities: + article_list, white_list = self._incremental_intersection( + entity, white_list=white_list) + print(article_list) + # last iteration contains the final intersection + scores = [] + publications = [] + for article in article_list: + article_id = article['extId'] + pub = Publication(pm_id=article_id, pmc_id=article['pmcid']) + publications.append(pub) + scores.append(white_list[article_id]) + + publications = np.array(publications) + scores = np.array(scores) + inds = scores.argsort()[::-1] + return publications[inds] + + def get_co_occurrences(self, entity: str, limit: int = 20, types: List[str] = None) -> List[CoOccurrence]: + pass + + +if __name__ == "__main__": + pmc = PMC_Europe_Service() + for pub in pmc.get_mentions(['P53', 'PRDM1']): + print(pub) -- GitLab