Commit 7c40dabc authored by Piotr Gawron
Browse files

Merge branch '370-genes-annotations' into 'master'

Resolve "Genes annotations don't show"

Closes #370 and #19

See merge request minerva/core!345
parents 221341ba 4e9f4bf0
Pipeline #5772 passed with stage
in 1 minute and 56 seconds
......@@ -401,7 +401,11 @@ public class ChemicalParser extends CachableInterface implements IExternalServic
}
if (result != null) {
MeSH mesh = meshParser.getMeSH(result.getChemicalId());
if (mesh!=null) {
result.addSynonyms(mesh.getSynonyms());
} else {
logger.warn("Problematic mesh id: "+result.getChemicalId());
}
}
} catch (IOException e) {
......
......@@ -237,17 +237,17 @@ public class HgncAnnotator extends ElementAnnotator implements IExternalService
}
}
}
}
}
}
} catch (WrongResponseCodeIOException e) {
logger.warn(prefix + "Cannot find information for element.");
} catch (Exception e) {
throw new AnnotatorException(e);
}
}
}
}
}
}
}
} catch (WrongResponseCodeIOException e) {
logger.warn(prefix + "Cannot find information for element.");
} catch (Exception e) {
throw new AnnotatorException(e);
}
}
}
/**
* Creates query url for given {@link MiriamType#HGNC} identifier.
......@@ -265,10 +265,12 @@ public class HgncAnnotator extends ElementAnnotator implements IExternalService
*
* @param name
* {@link MiriamType#HGNC_SYMBOL}
* @return url to restful API web page for given HGNC symbol
*/
private String getHgncNameUrl(String name) {
  // Only the first whitespace-separated token of the name is used as the HGNC
  // symbol. The value is trimmed before splitting: without trimming, a name
  // with leading whitespace produces an empty first token, and a name made
  // only of whitespace makes split() return an empty array (index 0 would
  // throw ArrayIndexOutOfBoundsException).
  // String.valueOf keeps the previous behaviour of ("" + name) for null.
  String hgncSymbol = String.valueOf(name).trim().split("\\s+")[0];
  return REST_API_URL + "symbol/" + hgncSymbol;
}
/**
......@@ -304,34 +306,34 @@ public class HgncAnnotator extends ElementAnnotator implements IExternalService
} else {
Node entry = getNode("doc", resultNode.getChildNodes());
NodeList list = entry.getChildNodes();
for (int i = 0; i < list.getLength(); i++) {
Node node = list.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
if (node.getNodeName().equals("arr")) {
String type = getNodeAttr("name", node);
if (type.equals("uniprot_ids")) {
NodeList uniprotList = node.getChildNodes();
for (int j = 0; j < uniprotList.getLength(); j++) {
Node uniprotNode = uniprotList.item(j);
if (uniprotNode.getNodeType() == Node.ELEMENT_NODE) {
if (uniprotNode.getNodeName().equals("str")) {
result.add(createMiriamData(MiriamType.UNIPROT, uniprotNode.getTextContent()));
}
}
}
}
}
}
}
}
return result;
} catch (WrongResponseCodeIOException e) {
logger.warn("No HGNC data found for id: "+miriamData);
return new ArrayList<>();
} catch (Exception e) {
throw new AnnotatorException(e);
}
NodeList list = entry.getChildNodes();
for (int i = 0; i < list.getLength(); i++) {
Node node = list.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
if (node.getNodeName().equals("arr")) {
String type = getNodeAttr("name", node);
if (type.equals("uniprot_ids")) {
NodeList uniprotList = node.getChildNodes();
for (int j = 0; j < uniprotList.getLength(); j++) {
Node uniprotNode = uniprotList.item(j);
if (uniprotNode.getNodeType() == Node.ELEMENT_NODE) {
if (uniprotNode.getNodeName().equals("str")) {
result.add(createMiriamData(MiriamType.UNIPROT, uniprotNode.getTextContent()));
}
}
}
}
}
}
}
}
return result;
} catch (WrongResponseCodeIOException e) {
logger.warn("No HGNC data found for id: " + miriamData);
return new ArrayList<>();
} catch (Exception e) {
throw new AnnotatorException(e);
}
}
/**
......
......@@ -127,7 +127,7 @@ public interface ReferenceGenomeConnector {
* @param version
* version of the reference genome
* @throws IOException
* thrown when there is a problem with removing file
*/
void removeGenomeVersion(MiriamData organism, String version) throws IOException;
......@@ -135,7 +135,7 @@ public interface ReferenceGenomeConnector {
* Returns url to the file that describes reference genome.
*
* @param organism
* organism of reference genome
* @param version
* version of the reference genome
* @return url to the file that describes reference genome
......
......@@ -42,429 +42,433 @@ import lcsb.mapviewer.model.map.layout.ReferenceGenomeType;
*/
public class UcscReferenceGenomeConnector extends AbstractReferenceGenomeConnector implements ReferenceGenomeConnector {
/**
* Domain name of the UCSC download server. It is used both as the host of the
* FTP connection and as the host part of the "ftp://" urls built by
* {@link #getGenomeVersionFile(MiriamData, String)}.
*/
private static final String SERVER = "hgdownload.cse.ucsc.edu";
/**
* Prefix of cache keys under which the url of a genome file is stored. The
* full key is built as: prefix + organism resource + "\n" + genome version.
*/
static final String FILENAME_BY_ORGANISM_VERSION_PREFIX = "ORGANISM_VERSION_FILE:";
/**
* Default class logger.
*/
private Logger logger = Logger.getLogger(UcscReferenceGenomeConnector.class);
/**
* Regex pattern that finds organism section headers in the downloaded page.
* It matches html comments that end with "Downloads", a run of '=' characters
* and the comment terminator; group 1 is the text placed before "Downloads"
* (the organism name, possibly surrounded by spaces).
*/
private Pattern organismNamePattern = Pattern.compile("<!--([A-Za-z\\-\\.\\ ]+)Downloads [=]+ -->");
/**
* Regex pattern that finds reference genome versions. It matches paths of the
* form /goldenPath/VERSION/bigZips/; group 1 is the VERSION part.
*/
private Pattern organismDataUrlPattern = Pattern.compile("\\/goldenPath\\/([A-Za-z0-9\\-\\.]+)\\/bigZips\\/");
/**
* Access point to taxonomy information. Used to translate organism names
* found on the downloads page into taxonomy identifiers.
*/
@Autowired
private TaxonomyBackend taxonomyBackend;
/**
* Default constructor. Hands this connector's class over to the parent
* constructor.
*/
public UcscReferenceGenomeConnector() {
super(UcscReferenceGenomeConnector.class);
}
/**
 * Collects versions of all UCSC reference genomes returned by the reference
 * genome DAO whose organism equals the given one.
 */
@Override
public List<String> getDownloadedGenomeVersions(MiriamData organism) {
  List<String> versions = new ArrayList<>();
  for (ReferenceGenome genome : getReferenceGenomeDao().getByType(ReferenceGenomeType.UCSC)) {
    if (!genome.getOrganism().equals(organism)) {
      // genome of a different organism
      continue;
    }
    versions.add(genome.getVersion());
  }
  return versions;
}
/**
 * Returns versions of the reference genome listed on the UCSC downloads page
 * for the given organism.
 *
 * The page is divided into sections, each one opened by a header matched by
 * {@link #organismNamePattern}. The section of the requested organism spans
 * from the end of its header to the start of the next header (or to the end
 * of the page) and is scanned with {@link #organismDataUrlPattern}.
 *
 * @param organism
 *          organism for which genome versions should be listed
 * @return distinct genome versions ordered from the highest to the lowest
 *         number computed by {@link #extractInt(String)}; empty list when no
 *         section for the organism was found
 * @throws ReferenceGenomeConnectorException
 *           thrown when the downloads page or taxonomy data cannot be
 *           accessed
 */
@Override
public List<String> getAvailableGenomeVersion(MiriamData organism) throws ReferenceGenomeConnectorException {
  Set<String> ids = new HashSet<>();
  try {
    String content = getWebPageContent("http://hgdownload.cse.ucsc.edu/downloads.html");
    // -1 means that the section of the requested organism wasn't found (yet)
    int start = -1;
    int end = content.length();
    Matcher matcher = organismNamePattern.matcher(content);
    while (matcher.find()) {
      if (start >= 0) {
        // the header that follows our organism closes its section
        end = matcher.start();
        break;
      }
      String name = matcher.group(1).trim();
      // these sections don't describe organisms
      if (name.equalsIgnoreCase("Shared Data") || name.equalsIgnoreCase("liftOver File")) {
        continue;
      }
      MiriamData taxonomy = taxonomyBackend.getByName(name);
      if (organism.equals(taxonomy)) {
        start = matcher.end();
      }
    }
    // scan the section only when the organism was found in the list of
    // available organisms; otherwise the result stays empty
    if (start >= 0) {
      Matcher versionMatcher = organismDataUrlPattern.matcher(content.substring(start, end));
      while (versionMatcher.find()) {
        ids.add(versionMatcher.group(1).trim());
      }
    }
  } catch (IOException | TaxonomySearchException e) {
    throw new ReferenceGenomeConnectorException("Problem with accessing UCSC database", e);
  }
  List<String> result = new ArrayList<>(ids);
  Collections.sort(result, new Comparator<String>() {
    @Override
    public int compare(String o1, String o2) {
      // descending order; Integer.compare cannot overflow, unlike subtraction
      return Integer.compare(extractInt(o2), extractInt(o1));
    }
  });
  return result;
}
/**
 * Lists organisms found on the UCSC downloads page. Every section header
 * (except "Shared Data" and "liftOver File") is resolved by the taxonomy
 * backend; names that resolve to null are left out.
 */
@Override
public List<MiriamData> getAvailableOrganisms() throws ReferenceGenomeConnectorException {
  List<MiriamData> organisms = new ArrayList<>();
  try {
    String page = getWebPageContent("http://hgdownload.cse.ucsc.edu/downloads.html");
    Matcher headers = organismNamePattern.matcher(page);
    while (headers.find()) {
      String sectionName = headers.group(1).trim();
      boolean organismSection = !sectionName.equalsIgnoreCase("Shared Data")
          && !sectionName.equalsIgnoreCase("liftOver File");
      if (organismSection) {
        MiriamData taxonomy = taxonomyBackend.getByName(sectionName);
        if (taxonomy != null) {
          organisms.add(taxonomy);
        }
      }
    }
  } catch (IOException | TaxonomySearchException e) {
    throw new ReferenceGenomeConnectorException("Problem with accessing UCSC database", e);
  }
  return organisms;
}
/**
 * Downloads the genome from the url returned by
 * {@link #getGenomeVersionFile(MiriamData, String)}. An
 * {@link URISyntaxException} is rethrown as {@link InvalidStateException}.
 */
@Override
public void downloadGenomeVersion(MiriamData organism, String version, IProgressUpdater updater, boolean async)
    throws FileNotAvailableException, IOException, ReferenceGenomeConnectorException {
  try {
    String genomeUrl = getGenomeVersionFile(organism, version);
    downloadGenomeVersion(organism, version, updater, async, genomeUrl);
  } catch (URISyntaxException invalidUrl) {
    throw new InvalidStateException(invalidUrl);
  }
}
/**
 * Recomputes the value of a cached query. Supported queries are strings that
 * start with "http" (content of the web page is returned) and strings that
 * start with {@link #FILENAME_BY_ORGANISM_VERSION_PREFIX} followed by the
 * organism resource, a new line and the genome version (url of the genome
 * file is returned). Anything else is rejected with
 * {@link InvalidArgumentException}.
 */
@Override
public Object refreshCacheQuery(Object query) throws SourceNotAvailable {
  try {
    if (!(query instanceof String)) {
      throw new InvalidArgumentException("Don't know what to do with class: " + query.getClass());
    }
    String key = (String) query;
    if (key.startsWith("http")) {
      return getWebPageContent(key);
    }
    if (key.startsWith(FILENAME_BY_ORGANISM_VERSION_PREFIX)) {
      // payload: organism resource and genome version separated by a new line
      String[] parts = key.substring(FILENAME_BY_ORGANISM_VERSION_PREFIX.length()).split("\n");
      return getGenomeVersionFile(new MiriamData(MiriamType.TAXONOMY, parts[0]), parts[1]);
    }
    throw new InvalidArgumentException("Don't know what to do with string \"" + query + "\"");
  } catch (FileNotAvailableException e) {
    throw new SourceNotAvailable("Cannot find file for the query: " + query, e);
  } catch (IOException e) {
    throw new SourceNotAvailable(e);
  }
}
/**
 * Removes every UCSC reference genome that matches the given organism and
 * version: cached files of the genome and of its gene mappings are removed
 * from the big file cache and the genome itself is deleted through the DAO.
 */
@Override
public void removeGenomeVersion(MiriamData organism, String version) throws IOException {
  for (ReferenceGenome genome : getReferenceGenomeDao().getByType(ReferenceGenomeType.UCSC)) {
    if (!genome.getOrganism().equals(organism)) {
      continue;
    }
    if (!genome.getVersion().equals(version)) {
      continue;
    }
    // removing file from big file cache might not be the best idea here
    String genomeUrl = genome.getSourceUrl();
    if (getBigFileCache().isCached(genomeUrl)) {
      getBigFileCache().removeFile(genome.getSourceUrl());
    }
    for (ReferenceGenomeGeneMapping geneMapping : genome.getGeneMapping()) {
      String mappingUrl = geneMapping.getSourceUrl();
      if (getBigFileCache().isCached(mappingUrl)) {
        getBigFileCache().removeFile(geneMapping.getSourceUrl());
      }
    }
    getReferenceGenomeDao().delete(genome);
  }
}
/**
* Returns the backend used for resolving organism names into taxonomy
* identifiers.
*
* @return the taxonomyBackend
* @see #taxonomyBackend
*/
public TaxonomyBackend getTaxonomyBackend() {
return taxonomyBackend;
}
/**
* Replaces the backend used for resolving organism names into taxonomy
* identifiers.
*
* @param taxonomyBackend
*          the taxonomyBackend to set
* @see #taxonomyBackend
*/
public void setTaxonomyBackend(TaxonomyBackend taxonomyBackend) {
this.taxonomyBackend = taxonomyBackend;
}
/**
* Task that registers a UCSC reference genome in the database and downloads
* its file from the given url using the big file cache. Download progress is
* forwarded to the supplied updater and stored in the genome entry.
*
* @author Piotr Gawron
*
*/
private final class DownloadGenomeVersionTask implements Callable<Void> {
/**
* Url to the file that we want to download.
*
*/
private String url;
/**
* Callback listener that will receive information about download progress.
* Never null: the constructor substitutes a no-op implementation.
*
*/
private IProgressUpdater updater;
/**
* Organism for which we want to fetch genome.
*/
private MiriamData organism;
/**
* Version of the genome.
*/
private String version;
/**
* Default constructor.
*
* @param organism
*          {@link #organism}
* @param version
*          {@link #version}
* @param url
*          {@link #url}
* @param updater
*          {@link #updater}; when null, a no-op updater is used instead
*/
private DownloadGenomeVersionTask(MiriamData organism, String version, String url, IProgressUpdater updater) {
this.url = url;
this.organism = organism;
this.version = version;
if (updater != null) {
this.updater = updater;
} else {
// no-op updater, so the rest of the task doesn't have to deal with null
this.updater = new IProgressUpdater() {
@Override
public void setProgress(double progress) {
}
};
}
}
/**
* Adds the genome entry to the database, then downloads the genome file.
*
* @return always null
* @throws Exception
*           propagated from database or download operations
*/
@Override
public Void call() throws Exception {
getDbUtils().createSessionForCurrentThread();
try {
ReferenceGenome referenceGenome = new ReferenceGenome();
referenceGenome.setOrganism(organism);
referenceGenome.setType(ReferenceGenomeType.UCSC);
referenceGenome.setVersion(version);
referenceGenome.setSourceUrl(url);
// the entry is committed before the download starts
getReferenceGenomeDao().add(referenceGenome);
getReferenceGenomeDao().flush();
getReferenceGenomeDao().commit();
// NOTE(review): the session is closed here and once more in the finally
// block below, while the progress callback still uses the DAO - confirm
// that the double close and DAO access after close are intended
getDbUtils().closeSessionForCurrentThread();
getBigFileCache().downloadFile(url, false, new IProgressUpdater() {
@Override
public void setProgress(double progress) {
// NOTE(review): updater is never null here (see constructor), so this
// check looks redundant
if (updater != null) {
updater.setProgress(progress);
}
// we have to get the object because it's in separate thread
ReferenceGenome temp = getReferenceGenomeDao().getById(referenceGenome.getId());
temp.setDownloadProgress(progress);
getReferenceGenomeDao().update(temp);
getReferenceGenomeDao().commit();
}
});
return null;
} finally {
getDbUtils().closeSessionForCurrentThread();
}
}
}
/**
 * Downloads the genome from a custom url. In asynchronous mode the download
 * task is only submitted to the asynchronous executor; otherwise it is
 * submitted to the synchronous executor and passed to executeTask.
 */
@Override
public void downloadGenomeVersion(MiriamData organism, String version, IProgressUpdater updater, boolean async, String customUrl)
    throws IOException, URISyntaxException, ReferenceGenomeConnectorException {
  Callable<Void> downloadTask = new DownloadGenomeVersionTask(organism, version, customUrl, updater);
  if (!async) {
    Future<Void> pending = getSyncExecutorService().submit(downloadTask);
    executeTask(pending);
    return;
  }
  getAsyncExecutorService().submit(downloadTask);
}
/**
 * Builds the path on the server to the folder that stores data of the given
 * genome version: /goldenPath/VERSION/bigZips/.
 *
 * @param organism
 *          organism of reference genome (not used when building the path)
 * @param version
 *          version of reference genome
 * @return path on the server to the folder with data about reference genome
 */
private String getGenomePath(MiriamData organism, String version) {
  StringBuilder path = new StringBuilder("/goldenPath/");
  path.append(version);
  path.append("/bigZips/");
  return path.toString();
}
/**
 * Returns ftp url of the .2bit file with the given genome version. The url is
 * taken from the cache when available; otherwise the genome folder on the
 * server is listed over FTP, the first file with the ".2bit" extension is
 * chosen and the resulting url is stored in the cache.
 *
 * @throws FileNotAvailableException
 *           thrown when the server refuses the connection, there is an FTP
 *           communication problem (also when disconnecting) or no .2bit file
 *           exists in the genome folder
 */
@Override
public String getGenomeVersionFile(MiriamData organism, String version) throws FileNotAvailableException {
  String cacheKey = FILENAME_BY_ORGANISM_VERSION_PREFIX + organism.getResource() + "\n" + version;
  String cachedUrl = super.getCacheValue(cacheKey);
  if (cachedUrl != null) {
    return cachedUrl;
  }

  String twoBitFile = null;
  String remotePath = getGenomePath(organism, version);
  FTPClient ftp = createFtpClient();
  try {
    ftp.connect(SERVER);
    // the reply code tells whether the connection attempt succeeded
    if (!FTPReply.isPositiveCompletion(ftp.getReplyCode())) {
      throw new FileNotAvailableException("Cannot find file with genome for: " + organism + "; " + version + ". FTP server refused connection.");
    }
    ftp.enterLocalPassiveMode();
    ftp.login("anonymous", "");
    ftp.setFileType(FTP.BINARY_FILE_TYPE);
    for (FTPFile candidate : ftp.listFiles(remotePath)) {
      if (!candidate.getName().endsWith(".2bit")) {
        continue;
      }
      if (twoBitFile == null) {
        twoBitFile = candidate.getName();
      } else {
        logger.warn("More than one 2bit file found in a folder: " + remotePath + ". Using first: " + twoBitFile);
      }
    }
    ftp.logout();
  } catch (IOException e) {
    throw new FileNotAvailableException(e);
  } finally {
    if (ftp.isConnected()) {
      try {
        ftp.disconnect();
      } catch (IOException ioe) {
        throw new FileNotAvailableException("Cannot find file with genome for: " + organism + "; " + version + ". Problem with ftp connection.", ioe);
      }
    }
  }

  if (twoBitFile == null) {
    throw new FileNotAvailableException("Cannot find file with genome for: " + organism + "; " + version);
  }
  String result = "ftp://" + SERVER + remotePath + twoBitFile;
  super.setCacheValue(cacheKey, result);
  return result;
}
/**
 * Factory method for the {@link FTPClient} used when listing genome files.
 *
 * @return freshly created {@link FTPClient}
 */
FTPClient createFtpClient() {
  return new FTPClient();
}
/**
* Extracts int from the version of the genome. The genome version look like
* follow: xxxx011.
*
* @param s
* genome version where suffix part is integer number that informs
* about version
* @return {@link Integer} representing version of the genome from string that
* describes genome version (it contains also some letters characters}
*/
int extractInt(String s) {
int startIndex = 0;
int endIndex = s.length() - 2;
for (int i = 0; i < s.length(); i++) {
startIndex = i;
if (s.charAt(i) >= '0' && s.charAt(i) <= '9') {
break;
}
}
for (int i = startIndex; i < s.length(); i++) {
if (s.charAt(i) < '0' || s.charAt(i) > '9') {
break;
}
endIndex = i;
}
endIndex++;
if (startIndex >= endIndex) {
return 0;
} else {