move_hom_ref_calls python error
james-lawlor opened this issue · 1 comment
james-lawlor commented
Hi,
Perhaps the move_hom_ref_calls option isn't fully implemented yet, but I saw it in the options for the latest branch and thought it sounded useful. However, when running vcf_to_bq with that option, I get a TypeError ('NoneType' object is not iterable) raised at line 163 of bigquery_row_generator.py, in _get_variant_meta_record.
Running branch eb51fdd from October 30, 2020.
Without that option, variants load into bigquery as expected.
Relevant selection of output:
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-04-29T19:52:17.449Z: JOB_MESSAGE_BASIC: Executing operation VariantToAvrochr5/WriteToAvroFiles/Write/WriteImpl/GroupByKey/Read+VariantToAvrochr5/WriteToAvroFiles/Write/WriteImpl/GroupByKey/GroupByWindow+VariantToAvrochr5/WriteToAvroFiles/Write/WriteImpl/Extract
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-04-29T19:52:19.499Z: JOB_MESSAGE_ERROR: Traceback (most recent call last):
File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1344, in apache_beam.runners.common._OutputProcessor.process_outputs
File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/libs/bigquery_row_generator.py", line 228, in get_rows
variant, allow_incompatible_records)
File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/libs/bigquery_row_generator.py", line 280, in _get_base_row_from_variant
variant, allow_incompatible_records)
File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/libs/bigquery_row_generator.py", line 163, in _get_variant_meta_record
for name, encoded_name in variant.hom_ref_calls:
TypeError: 'NoneType' object is not iterable
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 649, in do_work
work_executor.execute()
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 179, in execute
op.start()
File "dataflow_worker/shuffle_operations.py", line 63, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 64, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 79, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 80, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 84, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "apache_beam/runners/worker/operations.py", line 332, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 195, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "dataflow_worker/shuffle_operations.py", line 261, in dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
File "dataflow_worker/shuffle_operations.py", line 268, in dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
File "apache_beam/runners/worker/operations.py", line 332, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 195, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 670, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 671, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1279, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1371, in apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 195, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 670, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 671, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1279, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1371, in apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 195, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 670, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 671, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1215, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1294, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "/usr/local/lib/python3.7/site-packages/future/utils/__init__.py", line 446, in raise_with_traceback
raise exc.with_traceback(traceback)
File "apache_beam/runners/common.py", line 1213, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 569, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1344, in apache_beam.runners.common._OutputProcessor.process_outputs
File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/libs/bigquery_row_generator.py", line 228, in get_rows
variant, allow_incompatible_records)
File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/libs/bigquery_row_generator.py", line 280, in _get_base_row_from_variant
variant, allow_incompatible_records)
File "/usr/local/lib/python3.7/site-packages/gcp_variant_transforms/libs/bigquery_row_generator.py", line 163, in _get_variant_meta_record
for name, encoded_name in variant.hom_ref_calls:
TypeError: 'NoneType' object is not iterable [while running 'VariantToAvrochr7/ConvertToAvroRecords']
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-04-29T19:52:20.119Z: JOB_MESSAGE_BASIC: Finished operation VariantToAvrochr5/WriteToAvroFiles/Write/WriteImpl/GroupByKey/Read+VariantToAvrochr5/WriteToAvroFiles/Write/WriteImpl/GroupByKey/GroupByWindow+VariantToAvrochr5/WriteToAvroFiles/Write/WriteImpl/Extract
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-04-29T19:52:20.180Z: JOB_MESSAGE_DEBUG: Value "VariantToAvrochr5/WriteToAvroFiles/Write/WriteImpl/Extract.out" materialized.
Command:
COMMAND="python -m gcp_variant_transforms.vcf_to_bq \
--setup_file /gpfs/gpfs1/home/jlawlor/bigquery/gcp-variant-transforms/setup.py \
--project ${GOOGLE_CLOUD_PROJECT} \
--allow_malformed_records \
--region us-west1 \
--temp_location ${TEMP_LOCATION} \
--input_pattern ${INPUT_PATTERN} \
--output_table ${OUTPUT_TABLE} \
--job_name vcf-to-bigquery \
--runner DataflowRunner \
--include_call_name \
--sharding_config_path /gpfs/gpfs1/home/jlawlor/bigquery/gcp-variant-transforms/gcp_variant_transforms/data/sharding_configs/homo_sapiens_default.yaml \
--variant_merge_strategy MOVE_TO_CALLS \
--copy_quality_to_calls \
--copy_filter_to_calls \
--max_num_workers 25 \
--append \
--update_schema_on_append \
--move_hom_ref_calls \
--sample_lookup_optimized_output_table ${SAMPLE_TABLE}"
And the input VCF (some headers removed):
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##source=strelka
##source_version=2.9.10
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
##INFO=<ID=BLOCKAVG_min30p3a,Number=0,Type=Flag,Description="Non-variant multi-site block. Non-variant blocks are defined independently for each sample. All sites in such a block are constrained to be non-variant, have the same filter value, and have sample values {GQX,DP,DPF} in range [x,y], y <= max(x+3,(x*1.3)).">
##INFO=<ID=SNVHPOL,Number=1,Type=Integer,Description="SNV contextual homopolymer length">
##INFO=<ID=CIGAR,Number=A,Type=String,Description="CIGAR alignment for each alternate indel allele">
##INFO=<ID=RU,Number=A,Type=String,Description="Smallest repeating sequence unit extended or contracted in the indel allele relative to the reference. RUs are not reported if longer than 20 bases">
##INFO=<ID=REFREP,Number=A,Type=Integer,Description="Number of times RU is repeated in reference">
##INFO=<ID=IDREP,Number=A,Type=Integer,Description="Number of times RU is repeated in indel allele">
##INFO=<ID=MQ,Number=1,Type=Integer,Description="RMS of mapping quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GQX,Number=1,Type=Integer,Description="Empirically calibrated genotype quality score for variant sites, otherwise minimum of {Genotype quality assuming variant position,Genotype quality assuming non-variant position}">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Filtered basecall depth used for site genotyping. In a non-variant multi-site block this value represents the average of all sites in the block.">
##FORMAT=<ID=DPF,Number=1,Type=Integer,Description="Basecalls filtered from input prior to site genotyping. In a non-variant multi-site block this value represents the average of all sites in the block.">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum filtered basecall depth used for site genotyping within a non-variant multi-site block">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed. For indels this value only includes reads which confidently support each allele (posterior prob 0.51 or higher that read contains indicated allele vs all other intersecting indel alleles)">
##FORMAT=<ID=ADF,Number=.,Type=Integer,Description="Allelic depths on the forward strand">
##FORMAT=<ID=ADR,Number=.,Type=Integer,Description="Allelic depths on the reverse strand">
##FORMAT=<ID=FT,Number=1,Type=String,Description="Sample filter, 'PASS' indicates that all filters have passed for this sample">
##FORMAT=<ID=DPI,Number=1,Type=Integer,Description="Read depth associated with indel, taken from the site preceding the indel">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set identifier">
##FORMAT=<ID=SB,Number=1,Type=Float,Description="Sample site strand bias">
##FILTER=<ID=IndelConflict,Description="Indel genotypes from two or more loci conflict in at least one sample">
##FILTER=<ID=SiteConflict,Description="Site is filtered due to an overlapping indel call filter">
##FILTER=<ID=LowGQX,Description="Locus GQX is below threshold or not present">
##FILTER=<ID=HighDPFRatio,Description="The fraction of basecalls filtered out at a site is greater than 0.4">
##FILTER=<ID=HighSNVSB,Description="Sample SNV strand bias value (SB) exceeds 10">
##FILTER=<ID=HighDepth,Description="Locus depth is greater than 3x the mean chromosome depth">
##FILTER=<ID=LowDepth,Description="Locus depth is below 3">
##FILTER=<ID=NotGenotyped,Description="Locus contains forcedGT input alleles which could not be genotyped">
##FILTER=<ID=PloidyConflict,Description="Genotype call from variant caller not consistent with chromosome ploidy">
##FILTER=<ID=NoPassedVariantGTs,Description="No samples at this locus pass all sample filters and have a variant genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_id
chr7 117560845 rs10487372 C T 203 PASS SNVHPOL=4;MQ=60 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL 0/1:72:30:37:0:11,26:5,15:6,11:-23.4:PASS:238,0,69
chr7 117562430 rs2283057 C G 160 PASS SNVHPOL=4;MQ=60 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL 0/1:152:30:45:2:20,25:8,13:12,12:-20.2:PASS:195,0,149
chr7 117563520 rs4148707 GA G 185 PASS CIGAR=1M1D;RU=A;REFREP=2;IDREP=1;MQ=60 GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL 0/1:230:27:45:24,15:15,4:9,11:PASS:227,0,293
james-lawlor commented
(This example VCF doesn't have any hom-ref calls, but I was going for a simple reproduction of the error, which I originally encountered trying to load a multi-sample VCF, where using the option would make sense.)