Skip to content
Snippets Groups Projects
Commit a8b44a0f authored by Carlos Vega's avatar Carlos Vega
Browse files

moved file to python package

parent 87ed3826
No related branches found
No related tags found
No related merge requests found
import logging
from typing import List, Set, DefaultDict
from collections import defaultdict
import itertools
import requests
import json
import numpy as np
from textminingservice.TextMiningService import TextMiningService
from textminingservice.models.coocurrence import CoOccurrence
from textminingservice.models.publication import Publication
logger = logging.getLogger(__name__)
class PMC_Europe_Service(TextMiningService):
    """Text mining client backed by the Europe PMC annotations API.

    Retrieves the articles mentioning each given entity and incrementally
    intersects the per-entity result sets, accumulating a rank-based score
    per article (earlier articles — i.e. those with more mentions — score
    higher).
    """

    # Number of articles requested per API page.
    MAX_PAGE_SIZE = 8
    BASE_URL = 'https://www.ebi.ac.uk'
    MENTION_URL = BASE_URL + \
        '/europepmc/annotations_api/annotationsByEntity?entity={}&filter={}&format={}&cursorMark={}&pageSize={}'

    def __init__(self):
        # NOTE(review): 'PCM' looks like a typo for 'PMC'; kept byte-identical
        # in case callers look services up by name — confirm before renaming.
        super().__init__('PCM Europe',
                         'This client communicates with PCM Europe API.')

    def _get_single_entity_mentions(self, entity: str):
        """Yield ``(article, ext_id)`` for every article mentioning *entity*.

        Pages through the Europe PMC annotations endpoint, following
        ``nextCursorMark`` until the cursor stops advancing. The API returns
        articles sorted by number of mentions.
        See https://europepmc.org/AnnotationsApi#!/annotations45api45controller/getAnnotationsArticlesByEntityUsingGET

        Arguments:
            entity {str} -- entity name to search mentions for.

        Raises:
            requests.HTTPError: if the API responds with an error status.
        """
        prev_cursor_mark = -1
        cursor_mark = 0
        counter = 0
        while cursor_mark != prev_cursor_mark:
            url = PMC_Europe_Service.MENTION_URL.format(
                entity, 1, 'ID_LIST', cursor_mark, PMC_Europe_Service.MAX_PAGE_SIZE)
            # Use the module logger instead of print() for debug output.
            logger.debug('Fetching page (articles so far: %d): %s', counter, url)
            response = requests.get(url)
            # Explicit HTTP error instead of `assert response.ok`: asserts
            # are stripped under `python -O` and hide the failing status.
            response.raise_for_status()
            data = response.json()
            prev_cursor_mark = cursor_mark
            cursor_mark = data['nextCursorMark']
            for article in data['articles']:
                counter += 1
                yield article, article['extId']

    def _incremental_intersection(self, entity: str, white_list: DefaultDict[str, float] = None):
        """Intersect *entity*'s articles with *white_list* and rescore them.

        Takes the given entity and returns every article with an ID contained
        in the given white list, plus a new white list with the surviving ids
        and recalculated scores. The score is the normalized position
        ``1 - pos/length``, giving more importance to the first article
        (Europe PMC sorts articles by number of occurrences). Scores are
        added to the previous ones, so repeated calls accumulate.

        Arguments:
            entity {str} -- entity to fetch mentions for.
            white_list -- id -> accumulated score from previous calls, or
                ``None``/empty to keep every article.

        Returns:
            (list, dict) -- kept articles and the new id -> score mapping
            (never larger than the given white list, when one was given).
        """
        if white_list is None:
            white_list = defaultdict(float)
        kept_articles = []
        positions = []
        kept_ids = []
        index = 0
        for article, article_id in self._get_single_entity_mentions(entity):
            index += 1
            # With no white list keep everything; otherwise keep only the
            # articles already present (incremental intersection).
            if len(white_list) == 0 or article_id in white_list:
                kept_articles.append(article)
                positions.append(index)
                kept_ids.append(article_id)
        if index == 0:
            # The entity had no mentions at all: the intersection is empty.
            # Guard against the ZeroDivisionError the division below would
            # raise with index == 0.
            return [], {}
        scores = 1 - np.array(positions) / index  # normalize position score
        new_white_list = dict(zip(kept_ids, scores))
        # Accumulate the scores obtained in previous calls. (`ext_id` rather
        # than `id`, which shadows the builtin.)
        for ext_id in kept_ids:
            new_white_list[ext_id] += white_list.get(ext_id, 0.0)
        return kept_articles, new_white_list

    def get_mentions(self, entities: List[str], limit: int = 20) -> List[Publication]:
        """Return publications mentioning ALL given entities, best first.

        Since Europe PMC sorts publications by number of occurrences, the
        accumulated rank score can be seen as a degree of co-occurrence.

        Arguments:
            entities {List[str]} -- entities that must all be mentioned.
            limit {int} -- maximum number of publications to return.

        Returns:
            List[Publication] -- at most *limit* publications, sorted by
            descending accumulated score.
        """
        white_list = None
        article_list = []
        for entity in entities:
            article_list, white_list = self._incremental_intersection(
                entity, white_list=white_list)
        # The last iteration contains the final intersection.
        if white_list is None:
            # No entities were given.
            return []
        scores = []
        publications = []
        for article in article_list:
            article_id = article['extId']
            publications.append(
                Publication(pm_id=article_id, pmc_id=article['pmcid']))
            scores.append(white_list[article_id])
        # Sort by descending score and honour `limit`, which the original
        # implementation ignored. Returning a plain list matches the
        # declared return annotation (previously an np.ndarray leaked out).
        order = np.argsort(scores)[::-1][:limit]
        return [publications[i] for i in order]

    def get_co_occurrences(self, entity: str, limit: int = 20, types: List[str] = None) -> List[CoOccurrence]:
        # TODO: not implemented yet for this backend.
        pass
if __name__ == "__main__":
pmc = PMC_Europe_Service()
for pub in pmc.get_mentions(['P53', 'PRDM1']):
print(pub)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment