diff --git a/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java b/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java index de70f2f49512d700cc2a4348d59833f1168a960d..8f85a7d9573e011559496bfdb434ee0578a33129 100644 --- a/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java +++ b/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java @@ -39,723 +39,733 @@ import lcsb.mapviewer.model.map.MiriamType; */ public class DrugbankHTMLParser extends DrugAnnotation implements IExternalService { - /** - * Maximum available length of the pubmed identifiers. - */ - private static final int MAX_PUBMED_IDENTIFIER_LENGTH = 9; - - /** - * Prefix used for storing data about drug with name key value. - */ - static final String DRUG_NAME_PREFIX = "drug:"; - - /** - * Service used for annotation of proteins using {@link MiriamType#UNIPROT - * uniprot}. - */ - @Autowired - private UniprotAnnotator uniprotAnnotator; - - /** - * Pattern that extract information about drug name when accessing page about - * target. - */ - private final Pattern drugNamePattern = Pattern.compile("(?<=<td><strong>)([\\s\\S]*?)(?=</strong></td>)"); - - /** - * Default class logger. - */ - private Logger logger = Logger.getLogger(DrugbankHTMLParser.class); - - /** - * Url used for finding drug by drug name. - */ - static final String URLB = "https://www.drugbank.ca/search?searcher=drugs&utf8=%E2%9C%93&button=&filter=false&query="; - - /** - * Url used for retrieving general information about drug. - */ - static final String URLDRUG = "https://www.drugbank.ca/drugs/"; - - /** - * Homepage of drugbank. - */ - static final String URL = "https://www.drugbank.ca/"; - - /** - * Url used for accesing information about target synonyms. - */ - static final String URLPEP = "https://www.drugbank.ca/biodb/polypeptides/"; - - /** - * Url that allows to search for dugs using target hgnc name. - */ - static final String URL_TARGETS = "https://www.drugbank.ca/search?utf8=%E2%9C%93&searcher=targets&query="; - - /** - * Url that helps finding drug name for given target identifier. - */ - static final String URL_TARGET_DETAIL = "https://www.drugbank.ca/biodb/bio_entities/"; - - /** - * Pattern used to get information about {@link Drug#bloodBrainBarrier blood - * brain barrier}. - */ - private Pattern bloodBrainBarrierPattern = Pattern.compile("(?<=<td>)([\\s\\S]*?)(?=</td>)"); - - /** - * Pattern that extract information about target identifier when searching for - * targets with given set of hgnc names. - */ - private Pattern targetPattern = Pattern.compile("(?<=\"/biodb/bio_entities/)([\\s\\S]*?)(?=\")"); - - /** - * Default constructor. - */ - public DrugbankHTMLParser() { - super(DrugbankHTMLParser.class); - } - - @Override - public String refreshCacheQuery(Object query) throws SourceNotAvailable { - String result = null; - try { - if (query instanceof String) { - String name = (String) query; - if (name.startsWith(DRUG_NAME_PREFIX)) { - name = name.substring(DRUG_NAME_PREFIX.length()); - result = getDrugSerializer().objectToString(findDrug(name)); - } else if (name.startsWith("http")) { - result = getWebPageContent(name); - } else { - throw new InvalidArgumentException("Don't know what to do with string \"" + query + "\""); - } - } else { - throw new InvalidArgumentException("Don't know what to do with class: " + query.getClass()); - } - } catch (DrugSearchException e) { - throw new SourceNotAvailable(e); - } catch (IOException e) { - throw new SourceNotAvailable(e); - } - return result; - } - - /** - * Finds the drug identifier in the webpage with information about drug. - * - * @param webpageContent - * content of the webpage with information about drug - * @return drugbank identifier - */ - private String findIdInText(String webpageContent) { - int i = 0, j = 0; - i = webpageContent.indexOf("href=\"/drugs/"); - if (i == -1) { - return null; - } - i = i + "href=\"/drugs/".length(); - j = webpageContent.indexOf('"', i); - return webpageContent.substring(i, j); - } - - /** - * Finds the drug name in the webpage with information about drug. - * - * @param webpageContent - * content of the webpage with information about drug - * @return name of the drug - */ - private String findNameInText(String webpageContent) { - int i = 0, j = 0; - i = webpageContent.indexOf("a href=\"/drugs/"); - i = webpageContent.indexOf(">", i); - i = i + 1; - j = webpageContent.indexOf('<', i); - return webpageContent.substring(i, j); - } - - /** - * Finds the description of the drug on the conetnet of the webpage with - * information about drug. - * - * @param webpageContent - * content of the webpage with information about drug - * @return description of the drug - */ - String getDescriptionForDrug(String webpageContent) { - int startIndex = 0, endIndex = 0; - startIndex = webpageContent.indexOf("<th>Description</th><td>"); - if (startIndex == -1) { - return null; - } - startIndex = startIndex + "<th>Description</th><td>".length(); - endIndex = webpageContent.indexOf("</td", startIndex); - String description = webpageContent.substring(startIndex, endIndex); - // find special html characters - description = StringEscapeUtils.unescapeHtml4(description); - description = cleanHtml(description); - if (description.equalsIgnoreCase("Not Available")) { - description = null; - } - - return description; - } - - /** - * Returns list of the drug brand names from the webpage content. - * - * @param page - * content of the webpage with information about drug - * @return list of the drug brand names - */ - private List<String> getTargetBrands(String page) { - List<String> result = new ArrayList<String>(); - Pattern brandNamesParagraphPattern = Pattern.compile("(?<=<th>International Brands</th>)([\\s\\S]*?)(?=<th>Brand mixtures</th>)"); - Matcher matcher = brandNamesParagraphPattern.matcher(page); - - if (matcher.find()) { - String paragraph = matcher.group(1); - Pattern listElementPattern = Pattern.compile("(?<=<tr><td>)([\\s\\S]*?)(?=</td>)"); - Matcher elementMatcher = listElementPattern.matcher(paragraph); - while (elementMatcher.find()) { - result.add(elementMatcher.group(1)); - } - } - return result; - } - - /** - * Returns list of drug synonyms from drugbank database. - * - * @param page - * webpage content with drug information - * @return list of drug synonyms - */ - private List<String> getDrugSynonyms(String page) { - List<String> ans = new ArrayList<String>(); - - int i, j, end; - Boolean finding = true; - i = page.indexOf("Synonyms"); - end = page.indexOf("Prescription Products", i); - while (finding) { - i = page.indexOf("<tr><td>", i); - if (i == -1 || i > end) { - finding = false; - continue; - } - i = i + "<tr><td>".length(); - j = page.indexOf("</td>", i); - String synonym = StringEscapeUtils.unescapeHtml4(page.substring(i, j)); - ans.add(cleanHtml(synonym)); - } - - return ans; - } - - /** - * Returns references from reference string (obtained from drugbank homepage) - * . - * - * @param descriptionString - * string containing references obtained from drugbank webpage - * @return list of references obtained from drugbank {@link String} - */ - List<MiriamData> getPubmedFromRef(String descriptionString) { - List<MiriamData> result = new ArrayList<>(); - int currentStartIndex = 0; - while (true) { - currentStartIndex = descriptionString.indexOf("/pubmed/", currentStartIndex); - if (currentStartIndex == -1) { - break; - } - currentStartIndex = currentStartIndex + "/pubmed/".length(); - int endIndex = descriptionString.indexOf('"', currentStartIndex); - String identifier = descriptionString.substring(currentStartIndex, endIndex); - String correctedIdentifier = identifier.replaceAll("[^\\d]", ""); - if (!identifier.equals(correctedIdentifier)) { - if (correctedIdentifier.length() > MAX_PUBMED_IDENTIFIER_LENGTH) { - correctedIdentifier = correctedIdentifier.substring(0, MAX_PUBMED_IDENTIFIER_LENGTH); - } - logger.warn("Problematic pubmed identfier: \"" + identifier + "\". Trying fixing with the result: " + correctedIdentifier); - } - if (correctedIdentifier.equals("")) { - break; - } - result.add(new MiriamData(MiriamRelationType.BQ_BIOL_IS_DESCRIBED_BY, MiriamType.PUBMED, correctedIdentifier)); - } - return result; - } - - /** - * Returns targets parsed from page content. - * - * @param page - * webpage content from which data are extracted - * @throws DrugSearchException - * thrown when there are problems with connection to DrugBank - * database - * @return list of drug {@link Target} parsed from input string - */ - List<Target> getTargetsForDrug(String page) throws DrugSearchException { - List<Target> result = new ArrayList<>(); - try { - int pageStart; - Target target = new Target(); - target.setType(TargetType.SINGLE_PROTEIN); - - pageStart = page.indexOf("<div class=\"target well well"); - - int end = page.indexOf("<div class=\"enzyme well well"); - if (end < 0) { - end = page.indexOf("<div class=\"carrier well well"); - if (end < 0) { - end = page.indexOf("<div class=\"transporter well well"); - - if (end < 0) { - end = page.indexOf("<tr id=\"comments\">"); - if (end < 0) { - end = page.length() - 1; - } - } - } - } - - pageStart = page.indexOf("Details</a></div>", pageStart); - while (pageStart > 0 && pageStart < end) { - int targetStart = page.indexOf("</div><strong>", pageStart); - int nextTargetStart = page.indexOf("</div><strong>", targetStart + 1); - if (nextTargetStart < 0) { - nextTargetStart = end; - } - - target = parseTarget(page.substring(targetStart, nextTargetStart)); - if (target != null) { - result.add(target); - } - pageStart = nextTargetStart; - - } - } catch (TaxonomySearchException e) { - throw new DrugSearchException("Problem with finidng information about organism", e); - } catch (UniprotSearchException e) { - throw new DrugSearchException("Problem with finidng information about protein", e); - } - return result; - } - - /** - * Parse html info about target into {@link Target} structure. - * - * @param htmlPage - * string with html content - * @return {@link Target} for given html content - * @throws UniprotSearchException - * thrown when there is a problem with accessing uniprot db - * @throws TaxonomySearchException - * thrown when there is a problem with accessing taxonomy db - */ - protected Target parseTarget(String htmlPage) throws UniprotSearchException, TaxonomySearchException { - int kindIndex = htmlPage.indexOf("<dt>Kind</dt><dd>"); - kindIndex += "<dt>Kind</dt><dd>".length(); - int endKindIndex = htmlPage.indexOf("</dd>"); - String type = ""; - if (kindIndex > 0 && endKindIndex > kindIndex) { - type = htmlPage.substring(kindIndex, endKindIndex); - } - if (type.trim().equalsIgnoreCase("Protein")) { - int uniprotIdStart = htmlPage.indexOf("/biodb/polypeptides/") + "/biodb/polypeptides/".length(); - Target result = new Target(); - result.setType(TargetType.SINGLE_PROTEIN); - - // Getting ID && Name - int uniprotIdEnd = htmlPage.indexOf('"', uniprotIdStart); - String uniprotId = htmlPage.substring(uniprotIdStart, uniprotIdEnd); - MiriamData uniprotTarget = new MiriamData(MiriamType.UNIPROT, uniprotId); - MiriamData hgncTarget = uniprotAnnotator.uniProtToHgnc(uniprotTarget); - if (hgncTarget != null) { - result.addGene(hgncTarget); - } else { - result.addGene(uniprotTarget); - } - - int nameStart = uniprotIdEnd + 2; - int nameEnd = htmlPage.indexOf("</", uniprotIdStart); - String name = StringEscapeUtils.unescapeHtml4(htmlPage.substring(nameStart, nameEnd)); - result.setName(name); - - // Getting Organism - int organismStart = htmlPage.indexOf("Organism</dt><dd>", nameEnd) + "Organism</dt><dd>".length(); - int organismEnd = htmlPage.indexOf("</dd>", organismStart); - result.setOrganism(getTaxonomyBackend().getByName(htmlPage.substring(organismStart, organismEnd))); - - // Getting References - int referencesStart = htmlPage.indexOf("<strong>References</strong>", organismEnd); - if (referencesStart > 0) { - int referencesEnd = Math.min(htmlPage.indexOf("Details</a></div>", referencesStart), htmlPage.length()); - if (referencesEnd < 0) { - referencesEnd = htmlPage.length(); - } - result.addReferences(getPubmedFromRef(htmlPage.substring(referencesStart, referencesEnd))); - } - return result; - } else { - logger.warn("Unknown target type: " + type + ". Skipping."); - return null; - } - } - - /** - * Finds information about drug in drugbank database. - * - * @param name - * name of the dug that we are looking for - * @throws DrugSearchException - * thrown when there are problems with connection to DrugBank - * database - * @return drug with the information abtained from drugbank or - * <code>null</code> if such data couldn't be found - */ - private Drug findMoreInformation(String name) throws DrugSearchException { - Drug result = null; - try { - name = URLEncoder.encode(name, "UTF-8"); - - String accessUrl = URLB + name; - - String content = getWebPageContent(accessUrl); - - String[] lines = content.split("\n"); - for (String inputLine : lines) { - String tmp = findIdInText(inputLine); - if (tmp != null) { - result = new Drug(); - result.addSource(new MiriamData(MiriamRelationType.BQ_BIOL_IS_DESCRIBED_BY, MiriamType.DRUGBANK, tmp)); - result.setName(super.cleanHtml(findNameInText(inputLine))); - break; - } - } - - if (result != null) { - - accessUrl = URLDRUG + result.getSources().get(0).getResource(); - - String page = getWebPageContent(accessUrl); - - result.setDescription(getDescriptionForDrug(page)); - - result.setSynonyms(getDrugSynonyms(page)); - - // Getting drug brand names - result.setBrandNames(getTargetBrands(page)); - - result.addTargets(getTargetsForDrug(page)); - - result.setBloodBrainBarrier(getBloodBrainBarrier(page)); - - result.setApproved(getApproved(page)); - - if (!nameMatch(result, name)) { - result = null; - } - } - } catch (IOException e) { - throw new DrugSearchException(e); - } - return result; - } - - /** - * Check if drug data mateches with the search name. - * - * @param drug - * drug to be checked - * @param name - * name of the drug that we were looking for - * @return true if if drug data matches with the search name - */ - private boolean nameMatch(Drug drug, String name) { - Set<String> foundNames = new HashSet<>(); - foundNames.add(drug.getName()); - foundNames.addAll(drug.getSynonyms()); - foundNames.addAll(drug.getBrandNames()); - String lowerCaseName; - try { - lowerCaseName = java.net.URLDecoder.decode(name, "UTF-8").toLowerCase().replaceAll("[^A-Za-z0-9]", ""); - } catch (UnsupportedEncodingException e) { - lowerCaseName = name.toLowerCase().replaceAll("[^A-Za-z0-9]", ""); - } - for (String string : foundNames) { - String query = string.toLowerCase().replaceAll("[^A-Za-z0-9]", ""); - if (query.contains(lowerCaseName)) { - return true; - } - } - return false; - } - - /** - * Finds blood brain barrier info about drug in the webpage content. - * - * @param page - * webpage content used for parsing - * - * @return status of blood brain barrier - */ - String getBloodBrainBarrier(String page) { - int index = page.indexOf("<td>Blood Brain Barrier</td>"); - if (index <= 0) { - return "N/A"; - } - - Matcher matcher = bloodBrainBarrierPattern.matcher(page.substring(index)); - - matcher.find(); - if (!matcher.find()) { - logger.warn("Invalid html for Blood Brain Barrier..."); - return "N/A"; - } - String match = matcher.group(1); - if (match.contains("+")) { - return "YES"; - } else if (match.contains("-")) { - return "NO"; - } else { - logger.warn("Unknown Blood Brain Barrier status: " + match); - return "N/A"; - } - } - - /** - * Finds if drug is approved. - * - * @param page - * webpage content used for parsing - * - * @return <code>true</code> if drug is approved - */ - private Boolean getApproved(String page) { - int index = page.indexOf("<th>Groups</th>"); - if (index <= 0) { - return null; - } - - int end = page.indexOf("<th>", index + 1); - - if (end < index) { - end = page.length(); - } - String match = page.substring(index, end).toLowerCase(); - if (match.contains("approved")) { - if (match.contains("investigational") || match.contains("withdrawn") || match.contains("experimental")) { - logger.warn("Contradicting info about approved status: " + match); - return null; - } - return true; - } else if (match.contains("investigational") || match.contains("withdrawn") || match.contains("experimental")) { - return false; - } else { - logger.warn("Unknown approved status: " + match); - return null; - } - } - - @Override - public Drug findDrug(String drugName) throws DrugSearchException { - String query = DRUG_NAME_PREFIX + drugName; - Drug drug = null; - try { - drug = getDrugSerializer().xmlToObject(getCacheNode(query)); - } catch (SerializationException e) { - logger.error("Problem with deserializing element by query: " + query); - } - if (drug != null) { - return drug; - } - - drug = findMoreInformation(drugName); - - try { - setCacheValue(query, getDrugSerializer().objectToString(drug)); - } catch (SerializationException e) { - logger.error("Problem with serializing element "); - } - - return drug; - } - - @Override - public ExternalServiceStatus getServiceStatus() { - ExternalServiceStatus status = new ExternalServiceStatus("DrugBank", URL); - - GeneralCacheInterface cacheCopy = getCache(); - this.setCache(null); - - try { - Drug drug = findDrug("Amantadine"); - status.setStatus(ExternalServiceStatusType.OK); - if (drug == null) { - status.setStatus(ExternalServiceStatusType.CHANGED); - } - } catch (Exception e) { - logger.error("DrugBank is down", e); - status.setStatus(ExternalServiceStatusType.DOWN); - } - this.setCache(cacheCopy); - return status; - } - - @Override - public List<Drug> getDrugListByTarget(MiriamData targetMiriamData, Collection<MiriamData> organisms) throws DrugSearchException { - List<Drug> result = new ArrayList<>(); - if (targetMiriamData == null) { - return result; - } - if (!(MiriamType.HGNC_SYMBOL.equals(targetMiriamData.getDataType()))) { - throw new InvalidArgumentException("Only " + MiriamType.HGNC_SYMBOL + " type is accepted"); - } - String url = URL_TARGETS + targetMiriamData.getResource(); - - try { - String page = getWebPageContent(url); - - Set<String> drugNames = new HashSet<>(); - - Matcher matcher = targetPattern.matcher(page); - while (matcher.find()) { - String drugbankTargetId = matcher.group(0); - drugNames.addAll(getDrugNamesForTarget(new MiriamData(MiriamType.DRUGBANK_TARGET_V4, drugbankTargetId), targetMiriamData, organisms)); - } - for (String string : drugNames) { - Drug drug = findDrug(string); - if (drug == null) { - logger.warn("Cannot find drug that should be there: " + string); - } else { - boolean targets = false; - for (Target target : drug.getTargets()) { - for (MiriamData gene : target.getGenes()) { - if (gene.equals(targetMiriamData)) { - targets = true; - } - } - } - if (targets) { - result.add(drug); - } else { - logger.debug("Skipping drug that doesn't target required target. Drug name: " + drug.getName() + "; target: " + targetMiriamData); - } - } - } - - return result; - } catch (IOException e) { - throw new DrugSearchException("Cannot access drug database", e); - } - } - - /** - * Returns list of drugs that target element (target) identified be drugbank - * identifier. - * - * @param drugbankTarget - * {@link MiriamType#DRUGBANK_TARGET_V4 identifier} of the target - * @param hgncTarget - * identifier of the target using {@link MiriamType#HGNC_SYMBOL}, - * used for verification if the target really points to proper - * protein/gene - * @param organisms - * list of organisms to which results should be limited (when no - * organisms defined filtering will be turned off) - * @return list of drugs that target this target - * @throws DrugSearchException - * thrown when there are problems with connection to DrugBank - * database - */ - private Set<String> getDrugNamesForTarget(MiriamData drugbankTarget, MiriamData hgncTarget, Collection<MiriamData> organisms) throws DrugSearchException { - if (!MiriamType.DRUGBANK_TARGET_V4.equals(drugbankTarget.getDataType())) { - throw new InvalidArgumentException("drugbankTarget must be of type: " + MiriamType.DRUGBANK_TARGET_V4); - } - - try { - Set<String> drugNames = new HashSet<>(); - String url = URL_TARGET_DETAIL + drugbankTarget.getResource(); - - String page = getWebPageContent(url); - - int idPosition = page.indexOf("<th>DrugBank ID</th>"); - if (idPosition < 0) { - throw new DrugSearchException("Problematic web page for target: " + drugbankTarget + "(" + hgncTarget + ")"); - } - - int protienLinkPosition = page.indexOf("/polypeptides/"); - // sometimes there might not be an element - if (protienLinkPosition >= 0) { - protienLinkPosition = protienLinkPosition + "/polypeptides/".length(); // 20; - int j = page.indexOf('"', protienLinkPosition); - String uniprotId = page.substring(protienLinkPosition, j); - MiriamData uniprotMiriam = new MiriamData(MiriamType.UNIPROT, uniprotId); - MiriamData hgncMiriam = uniprotAnnotator.uniProtToHgnc(uniprotMiriam); - if (hgncMiriam == null || !hgncMiriam.equals(hgncTarget)) { - logger.debug("Invalid target found. Expected " + hgncTarget + ", but found: " + hgncMiriam + " (" + uniprotMiriam + ")"); - return drugNames; - } - - } else { - logger.warn("Invalid target found. No protein data available."); - return drugNames; - } - - int organismPosition = page.indexOf("Organism<"); - if (organismPosition >= 0) { - int organismStart = page.indexOf("<td>", organismPosition) + "<td>".length(); - int organismEnd = page.indexOf("<", organismStart + 1); - String organismName = page.substring(organismStart, organismEnd); - if (!organismMatch(organismName, organisms)) { - logger.debug("Organism doesn't match. Found" + organismName + ". Expected: " + organisms); - return drugNames; - } - } - - Matcher matcher = drugNamePattern.matcher(page.substring(idPosition)); - - while (matcher.find()) { - drugNames.add(matcher.group(1)); - } - - return drugNames; - } catch (IOException e) { - throw new DrugSearchException("Problem with accessing drugbank db", e); - } catch (UniprotSearchException e) { - throw new DrugSearchException("Problem with uniprot annotations", e); - } - } - - /** - * @return the uniprotAnnotator - * @see #uniprotAnnotator - */ - public UniprotAnnotator getUniprotAnnotator() { - return uniprotAnnotator; - } - - /** - * @param uniprotAnnotator - * the uniprotAnnotator to set - * @see #uniprotAnnotator - */ - public void setUniprotAnnotator(UniprotAnnotator uniprotAnnotator) { - this.uniprotAnnotator = uniprotAnnotator; - } - - @Override - protected WebPageDownloader getWebPageDownloader() { - return super.getWebPageDownloader(); - } - - @Override - protected void setWebPageDownloader(WebPageDownloader webPageDownloader) { - super.setWebPageDownloader(webPageDownloader); - } - -} + /** + * Maximum available length of the pubmed identifiers. + */ + private static final int MAX_PUBMED_IDENTIFIER_LENGTH = 9; + + /** + * Prefix used for storing data about drug with name key value. + */ + static final String DRUG_NAME_PREFIX = "drug:"; + + /** + * Service used for annotation of proteins using {@link MiriamType#UNIPROT + * uniprot}. + */ + @Autowired + private UniprotAnnotator uniprotAnnotator; + + /** + * Pattern that extract information about drug name when accessing page about + * target. + */ + private final Pattern drugNamePattern = Pattern.compile("(?<=<td><strong>)([\\s\\S]*?)(?=</strong></td>)"); + + /** + * Default class logger. + */ + private Logger logger = Logger.getLogger(DrugbankHTMLParser.class); + + /** + * Url used for finding drug by drug name. + */ + static final String URLB = "https://www.drugbank.ca/search?searcher=drugs&utf8=%E2%9C%93&button=&filter=false&query="; + + /** + * Url used for retrieving general information about drug. + */ + static final String URLDRUG = "https://www.drugbank.ca/drugs/"; + + /** + * Homepage of drugbank. + */ + static final String URL = "https://www.drugbank.ca/"; + + /** + * Url used for accessing information about target synonyms. + */ + static final String URLPEP = "https://www.drugbank.ca/biodb/polypeptides/"; + + /** + * Url that allows to search for drugs using target hgnc name. + */ + static final String URL_TARGETS = "https://www.drugbank.ca/search?utf8=%E2%9C%93&searcher=targets&query="; + + /** + * Url that helps finding drug name for given target identifier. + */ + static final String URL_TARGET_DETAIL = "https://www.drugbank.ca/biodb/bio_entities/"; + + /** + * Pattern used to get information about {@link Drug#bloodBrainBarrier blood + * brain barrier}. + */ + private Pattern bloodBrainBarrierPattern = Pattern.compile("(?<=<td>)([\\s\\S]*?)(?=</td>)"); + + /** + * Pattern that extract information about target identifier when searching for + * targets with given set of HGNC names. + */ + private Pattern targetPattern = Pattern.compile("(?<=\"/biodb/bio_entities/)([\\s\\S]*?)(?=\")"); + + /** + * Default constructor. + */ + public DrugbankHTMLParser() { + super(DrugbankHTMLParser.class); + } + + @Override + public String refreshCacheQuery(Object query) throws SourceNotAvailable { + String result = null; + try { + if (query instanceof String) { + String name = (String) query; + if (name.startsWith(DRUG_NAME_PREFIX)) { + name = name.substring(DRUG_NAME_PREFIX.length()); + result = getDrugSerializer().objectToString(findDrug(name)); + } else if (name.startsWith("http")) { + result = getWebPageContent(name); + } else { + throw new InvalidArgumentException("Don't know what to do with string \"" + query + "\""); + } + } else { + throw new InvalidArgumentException("Don't know what to do with class: " + query.getClass()); + } + } catch (DrugSearchException e) { + throw new SourceNotAvailable(e); + } catch (IOException e) { + throw new SourceNotAvailable(e); + } + return result; + } + /** + * Finds the drug identifier in the web page with information about drug. + * + * @param webpageContent + * content of the web page with information about drug + * @return drugbank identifier + */ + private String findIdInText(String webpageContent) { + int i = 0, j = 0; + i = webpageContent.indexOf("href=\"/drugs/"); + if (i == -1) { + return null; + } + i = i + "href=\"/drugs/".length(); + j = webpageContent.indexOf('"', i); + return webpageContent.substring(i, j); + } + + /** + * Finds the drug name in the webpage with information about drug. + * + * @param webpageContent + * content of the webpage with information about drug + * @return name of the drug + */ + private String findNameInText(String webpageContent) { + int i = 0, j = 0; + i = webpageContent.indexOf("a href=\"/drugs/"); + i = webpageContent.indexOf(">", i); + i = i + 1; + j = webpageContent.indexOf('<', i); + return webpageContent.substring(i, j); + } + + /** + * Finds the description of the drug on the content of the web page with + * information about drug. + * + * @param webpageContent + * content of the web page with information about drug + * @return description of the drug + */ + String getDescriptionForDrug(String webpageContent) { + int startIndex = 0, endIndex = 0; + startIndex = webpageContent.indexOf("Description</dt><dd class=\"col-md-10 col-sm-8\">"); + if (startIndex == -1) { + return null; + } + startIndex = startIndex + "Description</dt><dd class=\"col-md-10 col-sm-8\">".length(); + endIndex = webpageContent.indexOf("</dd", startIndex); + String description = webpageContent.substring(startIndex, endIndex); + // find special html characters + description = StringEscapeUtils.unescapeHtml4(description); + description = cleanHtml(description); + if (description.equalsIgnoreCase("Not Available")) { + description = null; + } + + return description; + } + + /** + * Returns list of the drug brand names from the web page content. + * + * @param page + * content of the web page with information about drug + * @return list of the drug brand names + */ + private List<String> getTargetBrands(String page) { + List<String> result = new ArrayList<>(); + int i, j, end; + Boolean finding = true; + i = page.indexOf("International/Other Brands"); + end = page.indexOf("<dt", i); + while (finding) { + i = page.indexOf("separated-list-item\">", i); + if (i == -1 || i > end) { + finding = false; + continue; + } + i = i + "separated-list-item\">".length(); + j = page.indexOf("<", i); + String synonym = StringEscapeUtils.unescapeHtml4(page.substring(i, j)); + synonym = cleanHtml(synonym).trim(); + if (synonym.indexOf("(") > 0) { + synonym = synonym.substring(0, synonym.indexOf("(")).trim(); + } + result.add(synonym); + } + return result; + } + + /** + * Returns list of drug synonyms from drugbank database. + * + * @param page + * web page content with drug information + * @return list of drug synonyms + */ + private List<String> getDrugSynonyms(String page) { + List<String> ans = new ArrayList<>(); + + int i, j, end; + Boolean finding = true; + i = page.indexOf("Synonyms"); + end = page.indexOf("Prescription Products", i); + while (finding) { + i = page.indexOf("<li>", i); + if (i == -1 || i > end) { + finding = false; + continue; + } + i = i + "<li>".length(); + j = page.indexOf("<", i); + String synonym = StringEscapeUtils.unescapeHtml4(page.substring(i, j)); + ans.add(cleanHtml(synonym)); + } + + return ans; + } + + /** + * Returns references from reference string (obtained from drugbank homepage) . + * + * @param descriptionString + * string containing references obtained from drugbank webpage + * @return list of references obtained from drugbank {@link String} + */ + List<MiriamData> getPubmedFromRef(String descriptionString) { + List<MiriamData> result = new ArrayList<>(); + int currentStartIndex = 0; + while (true) { + currentStartIndex = descriptionString.indexOf("/pubmed/", currentStartIndex); + if (currentStartIndex == -1) { + break; + } + currentStartIndex = currentStartIndex + "/pubmed/".length(); + int endIndex = descriptionString.indexOf('"', currentStartIndex); + String identifier = descriptionString.substring(currentStartIndex, endIndex); + String correctedIdentifier = identifier.replaceAll("[^\\d]", ""); + if (!identifier.equals(correctedIdentifier)) { + if (correctedIdentifier.length() > MAX_PUBMED_IDENTIFIER_LENGTH) { + correctedIdentifier = correctedIdentifier.substring(0, MAX_PUBMED_IDENTIFIER_LENGTH); + } + logger.warn("Problematic pubmed identfier: \"" + identifier + "\". Trying fixing with the result: " + + correctedIdentifier); + } + if (correctedIdentifier.equals("")) { + break; + } + result.add(new MiriamData(MiriamRelationType.BQ_BIOL_IS_DESCRIBED_BY, MiriamType.PUBMED, correctedIdentifier)); + } + return result; + } + + /** + * Returns targets parsed from page content. + * + * @param page + * webpage content from which data are extracted + * @throws DrugSearchException + * thrown when there are problems with connection to DrugBank database + * @return list of drug {@link Target} parsed from input string + */ + List<Target> getTargetsForDrug(String page) throws DrugSearchException { + List<Target> result = new ArrayList<>(); + try { + int pageStart; + Target target = new Target(); + target.setType(TargetType.SINGLE_PROTEIN); + + pageStart = page.indexOf("bond-list-container targets"); + + int end = page.indexOf("bond-list-container enzymes"); + if (end < 0) { + end = page.indexOf("bond-list-container carriers"); + if (end < 0) { + end = page.indexOf("bond-list-container transporters"); + + if (end < 0) { + end = page.indexOf("<tr id=\"comments\">"); + if (end < 0) { + end = page.length() - 1; + } + } + } + } + + while (pageStart > 0 && pageStart < end) { + int targetStart = page.indexOf("Details</a>", pageStart); + int nextTargetStart = page.indexOf("Details</a>", targetStart + 1); + if (nextTargetStart < 0) { + nextTargetStart = end; + } + + target = parseTarget(page.substring(targetStart, nextTargetStart)); + if (target != null) { + result.add(target); + } + pageStart = nextTargetStart; + + } + } catch (TaxonomySearchException e) { + throw new DrugSearchException("Problem with finidng information about organism", e); + } catch (UniprotSearchException e) { + throw new DrugSearchException("Problem with finidng information about protein", e); + } + return result; + } + + /** + * Parse html info about target into {@link Target} structure. + * + * @param htmlPage + * string with html content + * @return {@link Target} for given html content + * @throws UniprotSearchException + * thrown when there is a problem with accessing uniprot db + * @throws TaxonomySearchException + * thrown when there is a problem with accessing taxonomy db + */ + protected Target parseTarget(String htmlPage) throws UniprotSearchException, TaxonomySearchException { + int kindIndex = htmlPage.indexOf("Kind</dt><dd class=\"col-md-7 col-sm-6\">"); + kindIndex += "Kind</dt><dd class=\"col-md-7 col-sm-6\">".length(); + int endKindIndex = htmlPage.indexOf("</dd>"); + String type = ""; + if (kindIndex > 0 && endKindIndex > kindIndex) { + type = htmlPage.substring(kindIndex, endKindIndex); + } + if (type.trim().equalsIgnoreCase("Protein")) { + int uniprotIdStart = htmlPage.indexOf("/biodb/polypeptides/") + "/biodb/polypeptides/".length(); + Target result = new Target(); + result.setType(TargetType.SINGLE_PROTEIN); + + // Getting ID && Name + int uniprotIdEnd = htmlPage.indexOf('"', uniprotIdStart); + String uniprotId = htmlPage.substring(uniprotIdStart, uniprotIdEnd); + MiriamData uniprotTarget = new MiriamData(MiriamType.UNIPROT, uniprotId); + MiriamData hgncTarget = uniprotAnnotator.uniProtToHgnc(uniprotTarget); + if (hgncTarget != null) { + result.addGene(hgncTarget); + } else { + result.addGene(uniprotTarget); + } + + int nameStart = uniprotIdEnd + 2; + int nameEnd = htmlPage.indexOf("</", uniprotIdStart); + String name = StringEscapeUtils.unescapeHtml4(htmlPage.substring(nameStart, nameEnd)); + result.setName(name); + + // Getting Organism + int organismStart = htmlPage.indexOf("Organism</dt><dd class=\"col-md-7 col-sm-6\">", nameEnd) + + "Organism</dt><dd class=\"col-md-7 col-sm-6\">".length(); + int organismEnd = htmlPage.indexOf("</dd>", organismStart); + String organismString = htmlPage.substring(organismStart, organismEnd); + result.setOrganism(getTaxonomyBackend().getByName(organismString)); + + // Getting References + int referencesStart = htmlPage.indexOf("<h5>References</h5>", organismEnd); + if (referencesStart > 0) { + int referencesEnd = Math.min(htmlPage.indexOf("Details</a>", referencesStart), htmlPage.length()); + if (referencesEnd < 0) { + referencesEnd = htmlPage.length(); + } + result.addReferences(getPubmedFromRef(htmlPage.substring(referencesStart, referencesEnd))); + } + return result; + } else { + logger.warn("Unknown target type: " + type + ". Skipping."); + return null; + } + } + + /** + * Finds information about drug in drugbank database. + * + * @param name + * name of the dug that we are looking for + * @throws DrugSearchException + * thrown when there are problems with connection to DrugBank database + * @return drug with the information obtained from drugbank or <code>null</code> + * if such data couldn't be found + */ + private Drug findMoreInformation(String name) throws DrugSearchException { + Drug result = null; + try { + name = URLEncoder.encode(name, "UTF-8"); + + // minus is to prevent redirection from search that have single result + String accessUrl = URLB + name + "+-"; + + String content = getWebPageContent(accessUrl); + + String[] lines = content.split("\n"); + for (String inputLine : lines) { + String tmp = findIdInText(inputLine); + if (tmp != null) { + result = new Drug(); + result.addSource(new MiriamData(MiriamRelationType.BQ_BIOL_IS_DESCRIBED_BY, MiriamType.DRUGBANK, tmp)); + result.setName(super.cleanHtml(findNameInText(inputLine))); + break; + } + } + + if (result != null) { + + accessUrl = URLDRUG + result.getSources().get(0).getResource(); + + String page = getWebPageContent(accessUrl); + + result.setDescription(getDescriptionForDrug(page)); + + result.setSynonyms(getDrugSynonyms(page)); + + // Getting drug brand names + result.setBrandNames(getTargetBrands(page)); + + result.addTargets(getTargetsForDrug(page)); + + result.setBloodBrainBarrier(getBloodBrainBarrier(page)); + + result.setApproved(getApproved(page)); + + if (!nameMatch(result, name)) { + result = null; + } + } + } catch (IOException e) { + throw new DrugSearchException(e); + } + return result; + } + + /** + * Check if drug data mateches with the search name. + * + * @param drug + * drug to be checked + * @param name + * name of the drug that we were looking for + * @return true if if drug data matches with the search name + */ + private boolean nameMatch(Drug drug, String name) { + Set<String> foundNames = new HashSet<>(); + foundNames.add(drug.getName()); + foundNames.addAll(drug.getSynonyms()); + foundNames.addAll(drug.getBrandNames()); + String lowerCaseName; + try { + lowerCaseName = java.net.URLDecoder.decode(name, "UTF-8").toLowerCase().replaceAll("[^A-Za-z0-9]", ""); + } catch (UnsupportedEncodingException e) { + lowerCaseName = name.toLowerCase().replaceAll("[^A-Za-z0-9]", ""); + } + for (String string : foundNames) { + String query = string.toLowerCase().replaceAll("[^A-Za-z0-9]", ""); + if (query.contains(lowerCaseName)) { + return true; + } + } + return false; + } + + /** + * Finds blood brain barrier info about drug in the webpage content. + * + * @param page + * webpage content used for parsing + * + * @return status of blood brain barrier + */ + String getBloodBrainBarrier(String page) { + int index = page.indexOf("<td>Blood Brain Barrier</td>"); + if (index <= 0) { + return "N/A"; + } + + Matcher matcher = bloodBrainBarrierPattern.matcher(page.substring(index)); + + matcher.find(); + if (!matcher.find()) { + logger.warn("Invalid html for Blood Brain Barrier..."); + return "N/A"; + } + String match = matcher.group(1); + if (match.contains("+")) { + return "YES"; + } else if (match.contains("-")) { + return "NO"; + } else { + logger.warn("Unknown Blood Brain Barrier status: " + match); + return "N/A"; + } + } + + /** + * Finds if drug is approved. + * + * @param page + * webpage content used for parsing + * + * @return <code>true</code> if drug is approved + */ + private Boolean getApproved(String page) { + int index = page.indexOf("<dt class=\"col-md-2 col-sm-4\">Groups</dt>"); + if (index <= 0) { + return null; + } + + int end = page.indexOf("</dd>", index + 1); + + if (end < index) { + end = page.length(); + } + String match = page.substring(index, end).toLowerCase(); + if (match.contains("approved")) { + if (match.contains("investigational") || match.contains("withdrawn") || match.contains("experimental")) { + logger.warn("Contradicting info about approved status: " + match); + return null; + } + return true; + } else if (match.contains("investigational") || match.contains("withdrawn") || match.contains("experimental")) { + return false; + } else { + logger.warn("Unknown approved status: " + match); + return null; + } + } + + @Override + public Drug findDrug(String drugName) throws DrugSearchException { + String query = DRUG_NAME_PREFIX + drugName; + Drug drug = null; + try { + drug = getDrugSerializer().xmlToObject(getCacheNode(query)); + } catch (SerializationException e) { + logger.error("Problem with deserializing element by query: " + query); + } + if (drug != null) { + return drug; + } + + drug = findMoreInformation(drugName); + + try { + setCacheValue(query, getDrugSerializer().objectToString(drug)); + } catch (SerializationException e) { + logger.error("Problem with serializing element "); + } + + return drug; + } + + @Override + public ExternalServiceStatus getServiceStatus() { + ExternalServiceStatus status = new ExternalServiceStatus("DrugBank", URL); + + GeneralCacheInterface cacheCopy = getCache(); + this.setCache(null); + + try { + Drug drug = findDrug("Amantadine"); + status.setStatus(ExternalServiceStatusType.OK); + if (drug == null) { + status.setStatus(ExternalServiceStatusType.CHANGED); + } + } catch (Exception e) { + logger.error("DrugBank is down", e); + status.setStatus(ExternalServiceStatusType.DOWN); + } + this.setCache(cacheCopy); + return status; + } + + @Override + public List<Drug> getDrugListByTarget(MiriamData targetMiriamData, Collection<MiriamData> organisms) + throws DrugSearchException { + List<Drug> result = new ArrayList<>(); + if (targetMiriamData == null) { + return result; + } + if (!(MiriamType.HGNC_SYMBOL.equals(targetMiriamData.getDataType()))) { + throw new InvalidArgumentException("Only " + MiriamType.HGNC_SYMBOL + " type is accepted"); + } + String url = URL_TARGETS + targetMiriamData.getResource(); + + try { + String page = getWebPageContent(url); + + Set<String> drugNames = new HashSet<>(); + + Matcher matcher = targetPattern.matcher(page); + while (matcher.find()) { + String drugbankTargetId = matcher.group(0); + drugNames.addAll(getDrugNamesForTarget(new MiriamData(MiriamType.DRUGBANK_TARGET_V4, drugbankTargetId), + targetMiriamData, organisms)); + } + for (String string : drugNames) { + Drug drug = findDrug(string); + if (drug == null) { + logger.warn("Cannot find drug that should be there: " + string); + } else { + boolean targets = false; + for (Target target : drug.getTargets()) { + for (MiriamData gene : target.getGenes()) { + if (gene.equals(targetMiriamData)) { + targets = true; + } + } + } + if (targets) { + result.add(drug); + } else { + logger.debug("Skipping drug that doesn't target required target. Drug name: " + drug.getName() + + "; target: " + targetMiriamData); + } + } + } + + return result; + } catch (IOException e) { + throw new DrugSearchException("Cannot access drug database", e); + } + } + + /** + * Returns list of drugs that target element (target) identified be drugbank + * identifier. + * + * @param drugbankTarget + * {@link MiriamType#DRUGBANK_TARGET_V4 identifier} of the target + * @param hgncTarget + * identifier of the target using {@link MiriamType#HGNC_SYMBOL}, used + * for verification if the target really points to proper protein/gene + * @param organisms + * list of organisms to which results should be limited (when no + * organisms defined filtering will be turned off) + * @return list of drugs that target this target + * @throws DrugSearchException + * thrown when there are problems with connection to DrugBank database + */ + private Set<String> getDrugNamesForTarget(MiriamData drugbankTarget, MiriamData hgncTarget, + Collection<MiriamData> organisms) throws DrugSearchException { + if (!MiriamType.DRUGBANK_TARGET_V4.equals(drugbankTarget.getDataType())) { + throw new InvalidArgumentException("drugbankTarget must be of type: " + MiriamType.DRUGBANK_TARGET_V4); + } + + try { + Set<String> drugNames = new HashSet<>(); + String url = URL_TARGET_DETAIL + drugbankTarget.getResource(); + + String page = getWebPageContent(url); + + int idPosition = page.indexOf("<th>DrugBank ID</th>"); + if (idPosition < 0) { + throw new DrugSearchException("Problematic web page for target: " + drugbankTarget + "(" + hgncTarget + ")"); + } + + int protienLinkPosition = page.indexOf("/polypeptides/"); + // sometimes there might not be an element + if (protienLinkPosition >= 0) { + protienLinkPosition = protienLinkPosition + "/polypeptides/".length(); // 20; + int j = page.indexOf('"', protienLinkPosition); + String uniprotId = page.substring(protienLinkPosition, j); + MiriamData uniprotMiriam = new MiriamData(MiriamType.UNIPROT, uniprotId); + MiriamData hgncMiriam = uniprotAnnotator.uniProtToHgnc(uniprotMiriam); + if (hgncMiriam == null || !hgncMiriam.equals(hgncTarget)) { + logger.debug("Invalid target found. Expected " + hgncTarget + ", but found: " + hgncMiriam + " (" + + uniprotMiriam + ")"); + return drugNames; + } + + } else { + logger.warn("Invalid target found. No protein data available."); + return drugNames; + } + + int organismPosition = page.indexOf("Organism</dt><dd class=\"col-md-10 col-sm-8\">"); + if (organismPosition >= 0) { + int organismStart = organismPosition+ "Organism</dt><dd class=\"col-md-10 col-sm-8\">".length(); + int organismEnd = page.indexOf("<", organismStart + 1); + String organismName = page.substring(organismStart, organismEnd); + if (!organismMatch(organismName, organisms)) { + logger.debug("Organism doesn't match. Found" + organismName + ". Expected: " + organisms); + return drugNames; + } + } + + Matcher matcher = drugNamePattern.matcher(page.substring(idPosition)); + + while (matcher.find()) { + drugNames.add(matcher.group(1)); + } + + return drugNames; + } catch (IOException e) { + throw new DrugSearchException("Problem with accessing drugbank db", e); + } catch (UniprotSearchException e) { + throw new DrugSearchException("Problem with uniprot annotations", e); + } + } + + /** + * @return the uniprotAnnotator + * @see #uniprotAnnotator + */ + public UniprotAnnotator getUniprotAnnotator() { + return uniprotAnnotator; + } + + /** + * @param uniprotAnnotator + * the uniprotAnnotator to set + * @see #uniprotAnnotator + */ + public void setUniprotAnnotator(UniprotAnnotator uniprotAnnotator) { + this.uniprotAnnotator = uniprotAnnotator; + } + + @Override + protected WebPageDownloader getWebPageDownloader() { + return super.getWebPageDownloader(); + } + + @Override + protected void setWebPageDownloader(WebPageDownloader webPageDownloader) { + super.setWebPageDownloader(webPageDownloader); + } + +} \ No newline at end of file diff --git a/persist/src/db/11.0.5/fix_db_20171109.sql b/persist/src/db/11.0.5/fix_db_20171109.sql new file mode 100644 index 0000000000000000000000000000000000000000..3e427bea81e51a58776c8af9fd8d72bae191da94 --- /dev/null +++ b/persist/src/db/11.0.5/fix_db_20171109.sql @@ -0,0 +1,2 @@ +---clear drugbank cache +delete from cachequery where type = (select iddb from cache_type where classname='lcsb.mapviewer.annotation.services.DrugbankHTMLParser');