RumbleDB/rumble

Strange issue in IfRuntimeIterator

ingomueller-net opened this issue · 2 comments

I get a strange issue in the following query:

import module namespace hep = "../../common/hep.jq";
declare variable $input-path as anyURI external := anyURI("../../../data/Run2012B_SingleMu-1000.parquet");

declare function sinh($x) {
  (exp($x) - exp(-$x)) div 2.0
};

declare function cosh($x) {
  (exp($x) + exp(-$x)) div 2.0
};

declare function histogram($values, $lo, $hi, $num-bins) {
  let $width := ($hi - $lo) div $num-bins
  let $half-width := $width div 2

  let $underflow := round(($lo - $half-width) div $width)
  let $overflow := round(($hi - $half-width) div $width)

  for $v in $values
  let $bucket-idx :=
    if ($v < $lo) then $underflow
    else
      if ($v > $hi) then $overflow
      else round(($v - $half-width) div $width)
  let $center := $bucket-idx * $width + $half-width

  group by $center
  order by $center
  return {"x": $center, "y": count($v)}
};

declare function concat-leptons($event) {
  let $muons := (
    for $muon in $event.muons[]
    return {| $muon, {"type": "m"}  |}
  )

  let $electrons := (
    for $electron in $event.electrons[]
    return {| $electron, {"type": "e"}  |}
  )

  return ($muons, $electrons)
};

declare function RhoZ-to-eta($rho, $z) {
  let $temp := $z div $rho
  return log($temp + sqrt($temp * $temp + 1.0))
};

declare function PtEtaPhiM-to-PxPyPzE($vect) {
  let $x := $vect.pt * cos($vect.phi)
  let $y := $vect.pt * sin($vect.phi)
  let $z := $vect.pt * sinh($vect.eta)
  let $temp := $vect.pt * cosh($vect.eta)
  let $e := sqrt($temp * $temp + $vect.mass * $vect.mass)
  return {"x": $x, "y": $y, "z": $z, "e": $e}
};

declare function add-PxPyPzE($particle1, $particle2) {
  let $x := $particle1.x + $particle2.x
  let $y := $particle1.y + $particle2.y
  let $z := $particle1.z + $particle2.z
  let $e := $particle1.e + $particle2.e
  return {"x": $x, "y": $y, "z": $z, "e": $e}
};

declare function PxPyPzE-to-PtEtaPhiM($particle) {
  let $x2 := $particle.x * $particle.x
  let $y2 := $particle.y * $particle.y
  let $z2 := $particle.z * $particle.z
  let $e2 := $particle.e * $particle.e

  let $pt := sqrt($x2 + $y2)
  let $eta := RhoZ-to-eta($pt, $particle.z)
  let $phi := if ($particle.x = 0.0 and $particle.y = 0.0)
        then 0.0
        else atan2($particle.y, $particle.x)
  let $mass := sqrt($e2 - $z2 - $y2 - $x2)

  return {"pt": $pt, "eta": $eta, "phi": $phi, "mass": $mass}
};

declare function add-PtEtaPhiM($particle1, $particle2) {
  PxPyPzE-to-PtEtaPhiM(
    add-PxPyPzE(
      PtEtaPhiM-to-PxPyPzE($particle1),
      PtEtaPhiM-to-PxPyPzE($particle2)
      )
    )
};

declare function make-muons($event) {
  for $i in (1 to size($event.Muon_pt))
  return {
    "pt": $event.Muon_pt[[$i]],
    "eta": $event.Muon_eta[[$i]],
    "phi": $event.Muon_phi[[$i]],
    "mass": $event.Muon_mass[[$i]],
    "charge": $event.Muon_charge[[$i]],
    "pfRelIso03_all": $event.Muon_pfRelIso03_all[[$i]],
    "pfRelIso04_all": $event.Muon_pfRelIso04_all[[$i]],
    "tightId": $event.Muon_tightId[[$i]],
    "softId": $event.Muon_softId[[$i]],
    "dxy": $event.Muon_dxy[[$i]],
    "dxyErr": $event.Muon_dxyErr[[$i]],
    "dz": $event.Muon_dz[[$i]],
    "dzErr": $event.Muon_dzErr[[$i]],
    "jetIdx": $event.Muon_jetIdx[[$i]],
    "genPartIdx": $event.Muon_genPartIdx[[$i]]
  }
};

declare function make-electrons($event) {
  for $i in (1 to size($event.Electron_pt))
  return {
    "pt": $event.Electron_pt[[$i]],
    "eta": $event.Electron_eta[[$i]],
    "phi": $event.Electron_phi[[$i]],
    "mass": $event.Electron_mass[[$i]],
    "charge": $event.Electron_charge[[$i]],
    "pfRelIso03_all": $event.Electron_pfRelIso03_all[[$i]],
    "dxy": $event.Electron_dxy[[$i]],
    "dxyErr": $event.Electron_dxyErr[[$i]],
    "dz": $event.Electron_dz[[$i]],
    "dzErr": $event.Electron_dzErr[[$i]],
    "cutBasedId": $event.Electron_cutBasedId[[$i]],
    "pfId": $event.Electron_pfId[[$i]],
    "jetIdx": $event.Electron_jetIdx[[$i]],
    "genPartIdx": $event.Electron_genPartIdx[[$i]]
  }
};

declare function make-jets($event) {
  for $i in (1 to size($event.Jet_pt))
  return {
    "pt": $event.Jet_pt[[$i]],
    "eta": $event.Jet_eta[[$i]],
    "phi": $event.Jet_phi[[$i]],
    "mass": $event.Jet_mass[[$i]],
    "puId": $event.Jet_puId[[$i]],
    "btag": $event.Jet_btag[[$i]]
  }
};

declare function restructure-event($event) {
  let $muons := make-muons($event)
  let $electrons := make-electrons($event)
  let $jets := make-jets($event)
  return {| $event,
           {
              "muons": [ $muons ],
              "electrons": [ $electrons ],
              "jets": [ $jets ]
           }
         |}
};

declare function restructure-data($data) {
  for $event in $data
  return restructure-event($event)
};

declare function restructure-data-parquet($path) {
  for $event in parquet-file($path)
  return restructure-event($event)
};

let $filtered := (
  for $event in restructure-data-parquet($input-path)
  count $c
  where $c eq 3
  where integer($event.nMuon + $event.nElectron) > 2

  let $leptons := concat-leptons($event)
  let $closest-lepton-pair := (
    for $lepton1 at $i in $leptons
    for $lepton2 at $j in $leptons
    where $i < $j
    where $lepton1.type = $lepton2.type and $lepton1.charge != $lepton2.charge
    order by abs(91.2 - add-PtEtaPhiM($lepton1, $lepton2).mass) ascending
    return {"i": $i, "j": $j}
  )[1]

  where exists($closest-lepton-pair)

  return max(
    for $lepton at $i in $leptons
    where $i != $closest-lepton-pair.i and $i != $closest-lepton-pair.j
    return $lepton.pt
  )
)

return histogram($filtered, 15, 60, 100)

To reproduce, you may need to adapt $input-path so that it points to a local copy of the Run2012B_SingleMu-1000.parquet input file.

The error I get is the following:

21/02/16 13:16:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
log4j:WARN No appenders could be found for logger (org.apache.hadoop.security.UserGroupInformation).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
An error has occured: null
We should investigate this 🙈. Please contact us or file an issue on GitHub with your query.
Link: https://github.com/RumbleDB/rumble/issues
For more debug info (e.g., so you can communicate it to us), please try again using --show-error-info yes in your command line.
java.lang.NullPointerException
        at org.rumbledb.runtime.control.IfRuntimeIterator.closeLocal(IfRuntimeIterator.java:105)
        at org.rumbledb.runtime.HybridRuntimeIterator.close(HybridRuntimeIterator.java:84)
        at org.rumbledb.runtime.RuntimeIterator.lambda$0(RuntimeIterator.java:187)
        at java.util.ArrayList.forEach(ArrayList.java:1257)
        at org.rumbledb.runtime.RuntimeIterator.close(RuntimeIterator.java:187)
        at org.rumbledb.runtime.HybridRuntimeIterator.close(HybridRuntimeIterator.java:82)
        at org.rumbledb.runtime.RuntimeIterator.materialize(RuntimeIterator.java:274)
        at org.rumbledb.runtime.flwor.udfs.LetClauseUDF.call(LetClauseUDF.java:57)
        at org.rumbledb.runtime.flwor.udfs.LetClauseUDF.call(LetClauseUDF.java:1)
        at org.apache.spark.sql.UDFRegistration.$anonfun$register$283(UDFRegistration.scala:747)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.project_doConsume_0$(Unknown Source)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.sort_addToSorter_0$(Unknown Source)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
        at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
        at org.apache.spark.sql.execution.aggregate.SortAggregateExec.$anonfun$doExecute$2(SortAggregateExec.scala:80)
        at org.apache.spark.sql.execution.aggregate.SortAggregateExec.$anonfun$doExecute$2$adapted(SortAggregateExec.scala:77)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:859)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:859)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
        at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
        at org.apache.spark.scheduler.Task.run(Task.scala:127)
        at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

The issue is strange because if I change basically anything in the query, the error goes away. I have thus not succeeded in producing a shorter test case.

Thanks for reporting @ingomueller-net

It's a bug that happens rarely, good catch.

The problem was a Higgs boson that decayed into four leptons. I just adjusted the curvature of spacetime a bit and it now works.

OK, it works. I can see the Higgs boson now!