VCF processing
Closed this issue · 0 comments
carolinecunningham commented
mutagene currently can't process VCFs due to updates in MAF reader
steps to reproduce error:
run: mutagene motif search -g hg19 -i sample2.vcf
caroline's draft code to fix:
in motif_menu.py:
if ".vcf" not in args.infile.name:
mutations, mutations_with_context, processing_stats = read_MAF_with_context_window(args.infile, args.genome, args.window_size)
if len(mutations_with_context) == 0:
logger.warning("No mutations loaded")
matching_motifs = identify_motifs(mutations_with_context, custom_motif,
args.strand) if mutations_with_context is not None else []
else:
mutations, mutations_with_context, processing_stats = read_VCF_with_context_window(args.infile, args.genome, args.window_size)
if len(mutations_with_context) == 0:
logger.warning("No mutations loaded")
matching_motifs = identify_motifs(mutations_with_context, custom_motif,
args.strand) if mutations_with_context is not None else []
in motifs/init.py:
try:
for sample, mutations in samples_mutations.items():
if mutations is not None and len(mutations) > 0:
first_mut_seq_with_coords = mutations[0][-1]
window_size = (len(first_mut_seq_with_coords) - 1) // 2
for m in search_motifs:
for s in strand:
# print("IDENTIFYING MOTIF: ", m['name'])
result = get_enrichment(mutations, m['motif'], m['position'], m['ref'], m['alt'], window_size, s)
debug_data = {'sample': sample, 'motif': m['logo'], 'strand': s}
debug_data.update(result)
debug_string = pprint.pformat(debug_data, indent=4)
logger.debug(debug_string)
if result['mutation_load'] == 0:
continue
motif_dict = {
'sample': sample,
'name': m['name'],
'motif': m['logo'],
'strand': s,
'enrichment': result['enrichment'],
'pvalue': result['pvalue_fisher'],
'mutations_low_est': result['mutation_load'],
'mutations_high_est': result['bases_mutated_in_motif'],
# 'mut_motif': result['bases_mutated_in_motif'],
# 'mut_not_in_motif': result['bases_mutated_not_in_motif'],
# 'stat_count': result['bases_not_mutated_in_motif'],
# 'ref_count': result['bases_not_mutated_not_in_motif']
}
motif_matches.append(motif_dict)
except AttributeError:
# print(sample, len(mutations))
if samples_mutations is not None and len(samples_mutations) > 0:
first_mut_seq_with_coords = samples_mutations[0][-1]
window_size = (len(first_mut_seq_with_coords) - 1) // 2
for m in search_motifs:
for s in strand:
# print("IDENTIFYING MOTIF: ", m['name'])
result = get_enrichment(samples_mutations, m['motif'], m['position'], m['ref'], m['alt'], window_size, s)
debug_data = {'sample': "VCF", 'motif': m['logo'], 'strand': s}
debug_data.update(result)
debug_string = pprint.pformat(debug_data, indent=4)
logger.debug(debug_string)
if result['mutation_load'] == 0:
continue
motif_dict = {
'sample': "VCF",
'name': m['name'],
'motif': m['logo'],
'strand': s,
'enrichment': result['enrichment'],
'pvalue': result['pvalue_fisher'],
'mutations_low_est': result['mutation_load'],
'mutations_high_est': result['bases_mutated_in_motif'],
# 'mut_motif': result['bases_mutated_in_motif'],
# 'mut_not_in_motif': result['bases_mutated_not_in_motif'],
# 'stat_count': result['bases_not_mutated_in_motif'],
# 'ref_count': result['bases_not_mutated_not_in_motif']
}
motif_matches.append(motif_dict)
return motif_matches
in context_window.py:
def read_VCF_with_context_window(muts, asm, window_size):
cn = complementary_nucleotide
mutations = defaultdict(float)
N_skipped = 0
# N_skipped_indels = 0
raw_mutations = []
for line in muts.read().split("\n"):
if line.startswith("#"):
continue
if len(line) < 10:
continue
col_list = line.split()
if len(col_list) < 4:
continue
# ID = col_list[2]
# chromosome is expected to be one or two number or one letter
chrom = col_list[0] # VCF CHROM
if chrom.lower().startswith("chr"):
chrom = chrom[3:]
# if len(chrom) == 2 and chrom[1] not in "0123456789":
# chrom = chrom[0]
pos = int(col_list[1]) # VCF POS
x = col_list[3] # VCF REF
y = col_list[4] # VCF ALT
# if multiple REF or ALT alleles are given, ignore mutation entry (could mean seq error, could mean deletion)
if len(x) != 1:
N_skipped += 1
continue
if len(y) != 1:
N_skipped += 1
continue
transcript_strand = '+' # bad assumption about transcript strand
raw_mutations.append((chrom, pos, transcript_strand, x, y))
# print("RAW", raw_mutations)
# print("INDELS", N_skipped)
mutations_with_context = []
if len(raw_mutations) > 0:
contexts = get_context_twobit_window(raw_mutations, asm, window_size)
if contexts is None or len(contexts) == 0:
return None, None
for (chrom, pos, transcript_strand, x, y) in raw_mutations:
(p5, p3), seq_with_coords = contexts.get((chrom, pos), (("N", "N"), []))
# print("RESULT: {} {}".format(p5, p3))
if len(set([p5, x, y, p3]) - set(nucleotides)) > 0:
# print(chrom, pos, p5, p3, x)
# print("Skipping invalid nucleotides")
N_skipped += 1
continue
if x in "CT":
mutations[p5 + p3 + x + y] += 1.0
else:
# complementary mutation
mutations[cn[p3] + cn[p5] + cn[x] + cn[y]] += 1.0
mutations_with_context.append((chrom, pos, transcript_strand, x, y, seq_with_coords))
N_loaded = int(sum(mutations.values()))
processing_stats = {'loaded': N_loaded, 'skipped': N_skipped, 'format': 'VCF'}
return mutations, mutations_with_context, processing_stats