generate.py 3.46 KB
Newer Older
1
2
import gzip
import ntpath
Piotr Gawron's avatar
Piotr Gawron committed
3
4
import os
import subprocess
5
import sys
Piotr Gawron's avatar
Piotr Gawron committed
6
7
import urllib.request
from pathlib import Path
8

9
10
BIN_PATH = "bin/"

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

def transform(input_line):
    chromosome = input_line[2]
    chromosome_start = input_line[4]
    chromosome_end = input_line[5]
    name = input_line[12]
    score = input_line[11]
    strand = input_line[3]
    thick_start = input_line[6]
    thick_end = input_line[7]
    item_rgb = "255,0,0"
    block_count = input_line[8]
    block_starts = []
    block_sizes = []

    block_starts_tmp = input_line[9].split(",")
    block_ends_tmp = input_line[10].split(",")
    for i in range(0, int(block_count)):
29
        block_starts.append(str(int(block_starts_tmp[i]) - int(chromosome_start)))
30
        block_sizes.append(str(int(block_ends_tmp[i]) - int(block_starts_tmp[i])))
Piotr Gawron's avatar
Piotr Gawron committed
31
32
33
    output_line = [chromosome, chromosome_start, chromosome_end, name, score, strand, thick_start, thick_end, item_rgb,
                   block_count, ",".join(block_sizes), ",".join(block_starts)]
    return output_line
34
35


36
def downloadUcscFile(filename):
37
38
    if not os.path.isdir(BIN_PATH):
        os.makedirs(BIN_PATH)
39

40
    if not Path(BIN_PATH + filename).is_file():
41
        url = 'http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/' + filename
42
43
44
45
        print('Beginning file download ' + filename + ' (' + url + ')')

        urllib.request.urlretrieve(url, BIN_PATH + filename)
        os.chmod(BIN_PATH + filename, 0o755)
46
47


48
inputFile = sys.argv[1]
49
db = sys.argv[2]
Piotr Gawron's avatar
Piotr Gawron committed
50
basename = ntpath.basename(inputFile).replace(".txt.gz", "")
51
52
53
54
55
56

downloadUcscFile("fetchChromSizes")

chrom_sizes = basename + ".chrom.sizes"
with open(chrom_sizes, 'w') as output_chrom_sizes:
    print('Fetching chrom sizes: ' + chrom_sizes)
57
    subprocess.call([BIN_PATH + "fetchChromSizes", db], stdout=output_chrom_sizes)
58
59
60
61
62
63
64
65
66

chromosomes = {}
unkown_chromosomes = {}

with open(chrom_sizes) as f:
    lines = f.readlines()
    for line in lines:
        chromosomes[line.split("\t")[0]] = True

Piotr Gawron's avatar
Piotr Gawron committed
67
output_file_unsorted = basename + ".bed"
68

Piotr Gawron's avatar
Piotr Gawron committed
69
70
71
output = open(output_file_unsorted, "w")

print('Generating bed file: ' + output_file_unsorted)
72

73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
if inputFile.endswith('gz'):
    with gzip.open(inputFile, 'rt') as hIN:
        for line in hIN:
            F = line.rstrip('\n').split('\t')
            output_line = transform(F)
            if output_line[0] in chromosomes:
                print("\t".join(output_line), file=output)
            else:
                if not output_line[0] in unkown_chromosomes:
                    print("Unknown chromosome '" + output_line[0] + "'")
                unkown_chromosomes[output_line[0]] = True
else:
    with open(inputFile) as f:
        lines = f.readlines()
        for line in lines:
            F = line.rstrip('\n').split('\t')
            output_line = transform(F)
            if output_line[0] in chromosomes:
                print("\t".join(output_line), file=output)
            else:
                if not output_line[0] in unkown_chromosomes:
                    print("Unknown chromosome '" + output_line[0] + "'")
                unkown_chromosomes[output_line[0]] = True

97
output.close()
Piotr Gawron's avatar
Piotr Gawron committed
98

99
downloadUcscFile("bedSort")
Piotr Gawron's avatar
Piotr Gawron committed
100
101

output_file_sorted = basename + "-sorted.bed"
102
print('Sorting bed file: ' + output_file_sorted)
103
subprocess.call([BIN_PATH + "bedSort", output_file_unsorted, output_file_sorted])
Piotr Gawron's avatar
Piotr Gawron committed
104
105

downloadUcscFile("bedToBigBed")
106
107
108
109

output_big_bed = basename + ".bb"

print('Creating big bed file: ' + output_big_bed)
110
subprocess.call([BIN_PATH + "bedToBigBed", output_file_sorted, chrom_sizes, output_big_bed])