ahrefs/atd

atdgen: menhir error

aryx opened this issue · 6 comments

aryx commented

With this file:

(* New Semgrep syntax (hence the v2) specified using ATD instead of jsonschema.
 *
 * For more information on the new syntax, see:
 *  - Brandon's community Slack post announcing the new syntax
 *    https://semgrep.slack.com/archives/C018NJRRCJ0/p1698430726062769?thread_ts=1698350734.415849&cid=C018NJRRCJ0
 *  - Brandon's slides
 *    https://docs.google.com/presentation/d/1zzmyFbfNlJqweyzuuFlo4zpSs3Gqhfi6FiNRONSEQ0E/edit#slide=id.g1eee710cdbf_0_26
 *  - Pieter's video
 *    https://www.youtube.com/watch?v=dZUPjFvknnI
 *  - Parsia's blog post
 *    https://parsiya.net/blog/2023-10-28-semgreps-experimental-rule-syntax/
 *
 * Note that even if most Semgrep users use YAML to write a rule, and not JSON,
 * we still use a JSON tool (here ATD, but also jsonschema) to specify
 * the rule schema because YAML is a superset of JSON and can be
 * mechanically translated into JSON; there is no yamlschema
 * (see https://json-schema-everywhere.github.io/yaml). To add even more
 * confusion, a jsonschema can actually be specified using YAML (like in
 * rule_schema_v1.yml), and so one can use YAML syntax to specify the
 * JSON schema of files actually written in YAML (hmmm).
 *
 * Jsonschema is powerful but also arguably complicated and so it
 * might be simpler for many Semgrep developers (and also some Semgrep
 * users) to use ATD to specify and understand the schema of a rule.
 * It could provide a better basis to think about future syntax extensions.
 *
 * This file is now also used for some rule validation in
 * `semgrep --validate --develop`.
 *
 * Note that this file does not replace Parse_rule.ml nor Rule.ml. We still
 * want to accept the old syntax in Parse_rule.ml and also parse with
 * position information and error recovery which ATD does not provide.
 * This file does not (yet) replace rule_schema_v1.yml either, which is
 * more complete.
 *
 * TODO:
 *  - taint
 *  - extract
 *  - r2c-internal-project-depends-on-content
 *  - secrets
 *  - steps (and join?)
 *  - generalized taint
 *  - new metavariable types
 *  - new 'anywhere:'
 *)

(*****************************************************************************)
(* Basic types and string aliases *)
(*****************************************************************************)

(* Escape hatch: an opaque JSON value for parts of a rule that are not
 * (yet) described by this schema. Declared abstract; the <ocaml> annotation
 * maps it to type t of module Yojson.Basic.
 *)
type raw_json <ocaml module="Yojson.Basic" t="t"> = abstract

(* A glob pattern, carried as a plain string (no validation at this level).
 * ex: "*.c"
 *)
type glob = string

(* A regular expression, carried as a plain string (no validation at this
 * level).
 * ex: "[a-zA-Z_]*\\.c"
 *)
type regex = string

(*****************************************************************************)
(* The rule *)
(*****************************************************************************)

(* A single Semgrep rule.
 *
 * Field prefixes follow the usual ATD conventions:
 *  - no prefix: the field is required
 *  - '?': the field is optional and its type is an option
 *  - '~': the field is optional and falls back to a default value
 * A <json name=...> annotation gives the name used in the JSON when it
 * differs from the field name used here.
 *)
type rule = {
     id: rule_id;

     message: string;
     severity: severity;

     (* TODO: selector vs analyzer *)
     languages: language list;

     (* CHECK: exactly one of those fields must be set.
      * This constraint is not expressed by the types below; all three
      * fields are independently optional.
      * The first field is named match_ here but appears as match in the JSON.
      *)
     ?match_ <json name="match">: formula option;
     ?taint: taint_spec option;
     ?extract: extract option;
     (* TODO: join, steps, secrets, sca *)

     (* defaults to the Search variant when absent *)
     ~mode <ocaml default="`Search">: mode;
     (* TODO: product: product *)

     (* TODO? could be replaced by a pattern-filename: *)
     ?paths: paths option;

     ?fix: string option;
     ?fix_regex: fix_regex option;
     
     (* free-form JSON, not checked by this schema *)
     ?metadata: raw_json option;
     ?options: rule_options option;

     ?version: version option;
     ?min_version: version option;
     ?max_version: version option;

     (* later: equivalences: ... *)
}

(* A rule identifier: a JSON string wrapped through the OCaml module Rule_ID
 * (so the OCaml-side type is Rule_ID.t rather than string).
 * Expected to match "^[a-zA-Z0-9._-]*$"; NOTE(review): that check is
 * presumably enforced by Rule_ID, not by this schema -- confirm.
 *)
type rule_id = string wrap <ocaml module="Rule_ID">

(* A version, currently a plain string.
 * Intended to become Version_info.t once the wrap annotation in the TODO
 * below is enabled.
 *)
type version = string (* TODO  wrap <ocaml module="ATDStringWrap.Version"> *)

(* The mode of a rule. Each constructor is written in lowercase in the JSON,
 * as given by its <json name=...> annotation. The 'mode' field of 'rule'
 * defaults to Search.
 *)
type mode = [
  | Search <json name="search">
  | Taint <json name="taint">
  | Join <json name="join">
  | Extract <json name="extract">
  | SemgrepInternalPostprocessor <json name="semgrep_internal_postprocessor">
  (* TODO: Steps, SCA? *)
]

(*****************************************************************************)
(* Types of rule fields *)
(*****************************************************************************)

(* The severity of a rule. Constructors are written in uppercase in the JSON,
 * as given by their <json name=...> annotations.
 * coupling: semgrep_output_v1.atd with match_severity
 *)
type severity = [
  | Error <json name="ERROR">
  | Warning <json name="WARNING">
  | Info <json name="INFO">
  (* should not be used *)
  | Experiment <json name="EXPERIMENT">
  | Inventory <json name="INVENTORY">
]

(* The language names accepted in the 'languages' field of a rule.
 * Every accepted JSON spelling has its own constructor, so several
 * constructors look like alternative spellings of one language
 * (e.g. Cpp/CppSymbol, Js/Javascript, Py/Python); NOTE(review): this schema
 * does not say which spellings are equivalent -- see language.ml.
 * coupling: language.ml
 *)
type language = [
  (* programming (and configuration) languages *)
  | Apex <json name="apex">
  | Bash <json name="bash">
  | Sh <json name="sh">
  | C <json name="c">
  | Clojure <json name="clojure">
  | Cpp <json name="cpp">
  | CppSymbol <json name="c++">
  | Csharp <json name="csharp">
  | CsharpSymbol <json name="c#">
  | Dart <json name="dart">
  | Dockerfile <json name="dockerfile">
  | Docker <json name="docker">
  | Ex <json name="ex">
  | Elixir <json name="elixir">
  | Generic <json name="generic">
  | Go <json name="go">
  | Golang <json name="golang">
  | Hack <json name="hack">
  | Html <json name="html">
  | Java <json name="java">
  | Js <json name="js">
  | Javascript <json name="javascript">
  | Json <json name="json">
  | Jsonnet <json name="jsonnet">
  | Julia <json name="julia">
  | Kt <json name="kt">
  | Kotlin <json name="kotlin">
  | Lisp <json name="lisp">
  | Lua <json name="lua">
  | Ocaml <json name="ocaml">
  | Php <json name="php">
  | Python2 <json name="python2">
  | Python3 <json name="python3">
  | Py <json name="py">
  | Python <json name="python">
  | R <json name="r">
  | Ruby <json name="ruby">
  | Rust <json name="rust">
  | Scala <json name="scala">
  | Scheme <json name="scheme">
  | Solidity <json name="solidity">
  | Sol <json name="sol">
  | Swift <json name="swift">
  | Tf <json name="tf">
  | Hcl <json name="hcl">
  | Terraform <json name="terraform">
  | Ts <json name="ts">
  | Typescript <json name="typescript">
  | Vue <json name="vue">
  | Yaml <json name="yaml">

  (* not regular programming languages *)
  | Regex <json name="regex">
  | None <json name="none">
]

(* Path filters attached to a rule, as two lists of globs.
 * Both fields use '~', so a missing field is read as the empty list.
 * The first field is named include_ here but appears as include in the JSON.
 *)
type paths = {
  ~include_ <json name="include">: glob list;
  ~exclude: glob list;
}

(* A regex-based fix: a regex, its replacement string, and an optional count.
 * NOTE(review): count presumably limits the number of replacements
 * performed -- confirm against the code applying the fix.
 *)
type fix_regex = {
  regex: regex;
  replacement: string;
  ?count: int option;
}

(* The options of a rule. Abstract here: the definition is not part of this
 * file and the <ocaml from=...> annotation refers to Rule_options for it.
 *)
type rule_options <ocaml from="Rule_options" t="t"> = abstract

(*****************************************************************************)
(* Search mode (default) and formula *)
(*****************************************************************************)

(* 'formula' below is handled by a <json adapter.ocaml=...> because there is no
 * way to encode directly using ATD the way we chose to represent formulas
 * in YAML/JSON.
 *
 * old: this type was called new-pattern in rule_schema_v1.yml
 *)

(* A search formula.
 *
 * All the operator fields are optional here; the constraints written as
 * CHECK/check below are not expressed by the types.
 * The JSON is first rewritten by the adapter named in the
 * <json adapter.ocaml=...> annotation at the end of the definition.
 *)
type formula = {
  (* CHECK: exactly one of those fields must be set *)
  (* either directly a string or pattern: string in the JSON *)
  ?pattern: string option;
  ?regex: regex option;
  ?all: formula list option;
  ?any: formula list option;
  (* check: not/inside/anywhere can appear only inside an all: *)
  ?not: formula option;
  ?inside: formula option;
  ?anywhere: formula option;
  (* TODO? Taint of taint_spec *)

  (* alt: we could instead do '?all: formula list option * condition list'
   * above, but syntactically we also allow 'where' with pattern:, regex:,
   * etc. as in
   *    { pattern: ..., where: ..., }
   * In fact that's the main reason we sometimes have to use pattern: string
   * instead of a string because where: could not be attached to it.
   *)
  (* a missing where: is read as the empty list *)
  ~where: condition list;
}
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">

(* Just like for formula, we're using an adapter to transform
 * conditions in YAML like:
 *
 *  where:
 *   - metavariable: $X
 *     regex: $Z
 *
 * which when turned into JSON gives:
 *
 *  { where: [
 *     { metavariable: $X,
 *       regex: $Z
 *     }
 *   ] }
 * 
 * which we must transform into an ATD-compliant form:
 *
 *  [ ["M", [{ metavariable: $X,
 *             regex: $Z
 *           }]
 *    ]]
 *)
(* A condition of a where: clause.
 * Each case is tagged in the adapted JSON by a one-letter name
 * (F, C or M) given by its <json name=...> annotation; the adapter named
 * at the end of the definition is in charge of producing that form.
 *)
type condition = [
  | Focus <json name="F"> of focus
  | Comparison <json name="C"> of comparison
  | Metavariable <json name="M"> of metavariable_cond
  ]
<json adapter.ocaml="Rule_schema_v2_adapter.Condition">

(* A focus condition: the list of metavariables to focus on. *)
type focus = {
  (* either a single string or an array in JSON, that is
   * {focus: "$FOO"}, but also {focus: ["$FOO", "$BAR"]}
   * NOTE(review): the type below only describes the array form; the
   * single-string form is presumably normalized by an adapter -- confirm.
   *)
  focus: mvar list;
}

(* A metavariable name, carried as a plain string (ex: "$FOO"). *)
type mvar = string

(* A comparison condition: an expression given as a string, plus two
 * optional settings (base and strip) whose meaning is not specified in
 * this file. A missing strip is read as false.
 *)
type comparison = {
    comparison: string; (* expr *)
    ?base: int option;
    ~strip: bool;
  }

(* A condition on a single metavariable. *)
type metavariable_cond = {
  metavariable: mvar;
  (* CHECK: exactly one of those fields must be set *)
  (* NOTE: 'type' is a keyword of ATD and cannot be used as a field name.
   * This field is what makes atdgen fail on this file with the uncaught
   * Menhir exception. A likely workaround, mirroring match_ and include_
   * elsewhere in this file, is to name the field type_ and to give it a
   * <json name=...> annotation -- TODO confirm.
   *)
  ?type: string option;
  ?types: string list option;
  (* this covers regex:, pattern:, but also any formula.
   * TODO: for metavariable-regex, can also enable constant_propagation
   * TODO: we should accept also language: string
   *)
  ?analyzer: analyzer option;
}  

(* The analyzers that can be attached to a metavariable condition.
 * Constructors are written in lowercase in the JSON.
 *)
type analyzer = [
  | Entropy <json name="entropy">
  | Redos <json name="redos">
]

(*****************************************************************************)
(* Taint mode *)
(*****************************************************************************)

(* Not specified yet (see the TODO list at the top of the file): any JSON
 * value is accepted.
 *)
type taint_spec = raw_json

(*****************************************************************************)
(* Extract mode *)
(*****************************************************************************)

(* Not specified yet (see the TODO list at the top of the file): any JSON
 * value is accepted.
 *)
type extract = raw_json

(*****************************************************************************)
(* Toplevel *)
(*****************************************************************************)

(* The toplevel of a rule file: a required list of rules, plus an optional
 * counter sent by the registry.
 *)
type rules = {
  rules: rule list;

  (* Missed count of pro rules when not logged-in.
   * Sent by the registry to the CLI since 1.48.
   * See https://github.com/semgrep/semgrep-app/pull/11142
   *)
  ?missed: int option;
}

Running `atdgen rule_schema_v2.atd` fails with:

Fatal error: exception Atd.Parser.MenhirBasics.Error
Raised at Atd__Parser.MenhirBasics._eRR in file "atd/src/parser.ml" (inlined), line 8, characters 6-17
Called from Atd__Parser._menhir_run_043 in file "atd/src/parser.ml", line 2517, characters 10-17
Called from Atd__Parser.full_module in file "atd/src/parser.ml" (inlined), line 3593, characters 34-92
Called from Atd__Util.read_lexbuf in file "atd/src/util.ml", line 14, characters 19-56
Called from Atd__Util.load_file in file "atd/src/util.ml", line 64, characters 6-148
Re-raised at Atd__Util.load_file in file "atd/src/util.ml", line 72, characters 4-11
Called from Atdgen_emit__Ob_emit.make_ocaml_files in file "atdgen/src/ob_emit.ml", line 1364, characters 8-164
Called from Dune__exe__Ag_main in file "atdgen/bin/ag_main.ml", line 428, characters 6-13
Re-raised at Dune__exe__Ag_main in file "atdgen/bin/ag_main.ml", line 435, characters 11-18

instead of a clear parsing error.

aryx commented

It was because of the `?type: string option` field.

aryx commented

still, a better error message would be nice.

aryx commented

Low priority.

This happens because `type` is a keyword in ATD. I think it's a matter of adding a case for handling the error in the menhir file. Here's a minimal atd file with this error:

$ cat bug.atd
type t = { type: string }
$ atdgen bug.atd
Fatal error: exception Atd.Parser.MenhirBasics.Error

(btw, I don't know why I'm not getting a stack trace. I'm using atdgen 2.11.0 as shipped by opam 2.1.0 with ocaml 4.14.0)