Commit 3d66124d authored by Piotr Gawron's avatar Piotr Gawron
Browse files

first version of big bed generator

parent a714e91e
......@@ -2,4 +2,6 @@
bin
venv/
*.bed
*.bb
*.sizes
*.txt.gz
import gzip
import ntpath
import os
import sys
import subprocess
import sys
import urllib.request
from pathlib import Path
......@@ -24,15 +24,41 @@ def transform(input_line):
block_starts_tmp = input_line[9].split(",")
block_ends_tmp = input_line[10].split(",")
for i in range(0, int(block_count)):
block_starts.append(block_starts_tmp[i])
block_starts.append(str(int(block_starts_tmp[i]) - int(chromosome_start)))
block_sizes.append(str(int(block_ends_tmp[i]) - int(block_starts_tmp[i])))
output_line = [chromosome, chromosome_start, chromosome_end, name, score, strand, thick_start, thick_end, item_rgb,
block_count, ",".join(block_sizes), ",".join(block_starts)]
return output_line
def downloadUcscFile(filename):
if not Path('bin/' + filename).is_file():
print('Beginning file download ' + filename + '...')
url = 'http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/' + filename
urllib.request.urlretrieve(url, 'bin/' + filename)
os.chmod('bin/' + filename, 0o755)
inputFile = sys.argv[1]
db = sys.argv[2]
basename = ntpath.basename(inputFile).replace(".txt.gz", "")
downloadUcscFile("fetchChromSizes")
chrom_sizes = basename + ".chrom.sizes"
with open(chrom_sizes, 'w') as output_chrom_sizes:
print('Fetching chrom sizes: ' + chrom_sizes)
subprocess.call(["bin/fetchChromSizes", db], stdout=output_chrom_sizes)
chromosomes = {}
unkown_chromosomes = {}
with open(chrom_sizes) as f:
lines = f.readlines()
for line in lines:
chromosomes[line.split("\t")[0]] = True
output_file_unsorted = basename + ".bed"
output = open(output_file_unsorted, "w")
......@@ -43,27 +69,26 @@ with gzip.open(inputFile, 'rt') as hIN:
for line in hIN:
F = line.rstrip('\n').split('\t')
output_line = transform(F)
print("\t".join(output_line), file=output)
if output_line[0] in chromosomes:
print("\t".join(output_line), file=output)
else:
if not output_line[0] in unkown_chromosomes:
print("Unknown chromosome '" + output_line[0] + "'")
unkown_chromosomes[output_line[0]] = True
output.close()
if not Path('bin').is_dir():
os.mkdir('bin')
def downloadUcscFile(filename):
if not Path('bin/' + filename).is_file():
print('Beginning file download ' + filename + '...')
url = 'http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/' + filename
urllib.request.urlretrieve(url, 'bin/' + filename)
os.chmod('bin/' + filename, 0o755)
downloadUcscFile("sortBed")
output_file_sorted = basename + "-sorted.bed"
print('Sorting bed file: ')
print('Sorting bed file: ' + output_file_sorted)
subprocess.call(["bin/sortBed", output_file_unsorted, output_file_sorted])
downloadUcscFile("bedToBigBed")
downloadUcscFile("fetchChromSizes")
output_big_bed = basename + ".bb"
print('Creating big bed file: ' + output_big_bed)
subprocess.call(["bin/bedToBigBed", output_file_sorted, chrom_sizes, output_big_bed])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment