generate.py 1.98 KB
Newer Older
1
2
import gzip
import ntpath
Piotr Gawron's avatar
Piotr Gawron committed
3
import os
4
import sys
Piotr Gawron's avatar
Piotr Gawron committed
5
6
7
import subprocess
import urllib.request
from pathlib import Path
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28


def transform(input_line):
    """Rearrange one tab-split input row into the twelve BED columns.

    Columns are read by position from ``input_line``: 2 chromosome,
    3 strand, 4 start, 5 end, 6 thick start, 7 thick end, 8 block count,
    9 comma-separated block starts, 10 comma-separated block ends,
    11 score, 12 name.  (This matches the layout of a UCSC refGene-style
    table -- presumably that is the intended input; confirm against the
    files actually fed to the script.)

    Args:
        input_line: Sequence of strings, one per input column.

    Returns:
        A list of twelve strings: chromosome, start, end, name, score,
        strand, thick start, thick end, item RGB (always ``"255,0,0"``),
        block count, comma-joined block sizes, comma-joined block starts.
        Block starts are expressed relative to the start column, as the
        BED format requires.

    Raises:
        ValueError: If a coordinate is not an integer or fewer than
            ``block count`` starts/ends are present.
        IndexError: If ``input_line`` has fewer than 13 columns.
    """
    chromosome = input_line[2]
    chromosome_start = input_line[4]
    chromosome_end = input_line[5]
    name = input_line[12]
    score = input_line[11]
    strand = input_line[3]
    thick_start = input_line[6]
    thick_end = input_line[7]
    item_rgb = "255,0,0"
    block_count = input_line[8]

    count = int(block_count)
    # Only the first `count` entries are used, so a trailing comma (which
    # yields an empty last element after split) is harmless.
    raw_starts = input_line[9].split(",")[:count]
    raw_ends = input_line[10].split(",")[:count]
    if len(raw_starts) != count or len(raw_ends) != count:
        raise ValueError(
            "expected %d block starts/ends, got %d/%d"
            % (count, len(raw_starts), len(raw_ends))
        )

    starts = [int(value) for value in raw_starts]
    ends = [int(value) for value in raw_ends]

    # The input block starts are absolute coordinates; BED blockStarts must
    # be offsets from chromStart, so shift them by the start column.
    origin = int(chromosome_start)
    block_starts = [str(start - origin) for start in starts]
    block_sizes = [str(end - start) for start, end in zip(starts, ends)]

    output_line = [chromosome, chromosome_start, chromosome_end, name, score, strand, thick_start, thick_end, item_rgb,
                   block_count, ",".join(block_sizes), ",".join(block_starts)]
    return output_line
32
33
34


# Path of the gzip-compressed, tab-separated input table (first CLI argument).
inputFile = sys.argv[1]
# Output names are derived from the input file name with ".txt.gz" removed.
basename = ntpath.basename(inputFile).replace(".txt.gz", "")
output_file_unsorted = basename + ".bed"

# The context manager guarantees the BED file is flushed and closed even when
# reading the input or converting a row raises.
with open(output_file_unsorted, "w") as output:
    print('Generating bed file: ' + output_file_unsorted)

    with gzip.open(inputFile, 'rt') as hIN:
        for line in hIN:
            # One input row -> one tab-separated BED row.
            F = line.rstrip('\n').split('\t')
            output_line = transform(F)
            print("\t".join(output_line), file=output)
Piotr Gawron's avatar
Piotr Gawron committed
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

# Make sure the directory that receives the downloaded tools exists.
Path('bin').mkdir(exist_ok=True)


def downloadUcscFile(filename):
    """Download an executable into ``bin/`` unless it is already present.

    The file is fetched to a temporary ``bin/<filename>.part`` path, given
    mode 0o755 and only then renamed to ``bin/<filename>``.  An interrupted
    download therefore never leaves a truncated file that a later run would
    mistake for a complete one.

    Args:
        filename: Name of the file; used as the last URL segment and as the
            file name inside ``bin/``.

    Returns:
        None.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed download
        (e.g. ``urllib.error.URLError``), or ``OSError`` from the file
        operations.
    """
    target = Path('bin') / filename
    if target.is_file():
        return

    print('Beginning file download ' + filename + '...')

    # NOTE(review): the executable is fetched over plain HTTP and later run;
    # consider switching to HTTPS if the host supports it in your environment.
    url = 'http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/' + filename
    partial = Path('bin') / (filename + '.part')
    try:
        urllib.request.urlretrieve(url, str(partial))
        os.chmod(str(partial), 0o755)
        # Rename only after the download and chmod succeeded.
        os.replace(str(partial), str(target))
    finally:
        # After a successful rename the partial file is gone; otherwise
        # clean up whatever an aborted download left behind.
        if partial.exists():
            partial.unlink()


downloadUcscFile("sortBed")

output_file_sorted = basename + "-sorted.bed"
print('Sorting bed file: ')
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# sort stops the script instead of letting later steps run on missing or
# incomplete output.
subprocess.check_call(["bin/sortBed", output_file_unsorted, output_file_sorted])

downloadUcscFile("bedToBigBed")
downloadUcscFile("fetchChromSizes")