data-model-generator

Data model generator based on Scala case classes.

Goals
Getting started
Dialects
Installers
Extracting class metadata
Customizations

Goals

Generate data model (e.g. DDL, avro schema, Elasticsearch mapping) based on Scala case classes

Getting started

Include dependency:

"com.github.piotr-kalanski" % "data-model-generator_2.11" % "0.8.1"

<dependency>
    <groupId>com.github.piotr-kalanski</groupId>
    <artifactId>data-model-generator_2.11</artifactId>
    <version>0.8.1</version>
</dependency>

Dialects

H2 dialect

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

object H2Example extends App {
  println(DataModelGenerator.generate[Book](dialects.H2Dialect))
}

CREATE TABLE Book(
   title VARCHAR,
   year INT,
   owner OTHER,
   authors ARRAY
);

Hive dialect

import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

object HiveExample extends App {
  println(DataModelGenerator.generate[Book](new HiveGenerator()))
}

CREATE TABLE Book(
   title STRING,
   year INT,
   owner STRUCT<name : STRING, age : INT>,
   authors ARRAY<STRUCT<name : STRING, age : INT>>
);

Redshift dialect

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

object RedshiftExample extends App {
  println(DataModelGenerator.generate[Book](dialects.RedshiftDialect))
}

CREATE TABLE Book(
   title VARCHAR,
   year INTEGER,
   owner VARCHAR,
   authors VARCHAR
);

MySQL dialect

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

object MySQLExample extends App {
  println(DataModelGenerator.generate[Book](dialects.MySQLDialect))
}

CREATE TABLE Book(
   title VARCHAR,
   year INTEGER,
   owner JSON,
   authors JSON
);

Avro schema dialect

Avro schema

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

DataModelGenerator.generate[Book](dialects.AvroSchemaDialect)

{
   "namespace": "com.datawizards.dmg.examples",
   "type": "record",
   "name": "Book",
   "fields": [
      {"name": "title", "type": "string"},
      {"name": "year", "type": "int"},
      {"name": "owner", "type": "record", "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}]},
      {"name": "authors", "type": "array", "items": {"type": "record", "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}]}}
   ]
}

Avro schema for Avro Schema Registry

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int, skills: Seq[String])

DataModelGenerator.generate[Person](dialects.AvroSchemaRegistryDialect)

{"schema":
"{
   \"namespace\": \"com.datawizards.dmg.examples\",
   \"type\": \"record\",
   \"name\": \"Person\",
   \"fields\": [
      {\"name\": \"name\", \"type\": \"string\"},
      {\"name\": \"age\", \"type\": \"int\"},
      {\"name\": \"skills\", \"type\": \"array\", \"items\": \"string\"}
   ]
}"
}

Elasticsearch dialect

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

DataModelGenerator.generate[Book](dialects.ElasticsearchDialect)

{
   "mappings" : {
      "Book" : {
         "properties" : {
            "title" : {"type" : "string"},
            "year" : {"type" : "integer"},
            "owner" : {
               "properties" : {
                  "name" : {"type" : "string"},
                  "age" : {"type" : "integer"}
               }
            },
            "authors" : {
               "properties" : {
                  "name" : {"type" : "string"},
                  "age" : {"type" : "integer"}
               }
            }
         }
      }
   }
}

Java dialect

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](dialects.JavaDialect)

public class Person {
   private String name;
   private Integer age;

   public Person() {}

   public Person(String name, Integer age) {
      this.name = name;
      this.age = age;
   }

   public String getName() {
      return name;
   }

   public void setName(String name) {
      this.name = name;
   }

   public Integer getAge() {
      return age;
   }

   public void setAge(Integer age) {
      this.age = age;
   }
}

Installers

The library can also install a generated data model in its target data store, e.g. register a generated Avro schema in Avro Schema Registry, create an Elasticsearch index, or create a Hive table.

Register Avro schema to Avro schema registry

import com.datawizards.dmg.service.AvroSchemaRegistryServiceImpl

case class Person(name: String, age: Int)

object RegisterAvroSchema extends App {
  val service = new AvroSchemaRegistryServiceImpl("http://localhost:8081")
  service.registerSchema[Person]("person")

  println("Subjects:")
  println(service.subjects())

  println("Registered schema:")
  println(service.fetchSchema("person"))
}

"Subjects:"
["person"]
"Registered schema:"
{"type":"record","name":"Person","namespace":"com.datawizards.dmg.examples","fields":[{"name":"name","type":"string"},{"name":"age","type":"int"}]}

Copy Avro schema to HDFS

import com.datawizards.dmg.service.AvroSchemaRegistryServiceImpl

case class Person(name: String, age: Int)

object CopyAvroSchemaToHDFS extends App {
  val service = new AvroSchemaRegistryServiceImpl("http://localhost:8081")
  service.copyAvroSchemaToHdfs[Person]("/metadata/schemas/person")
}

Create Elasticsearch index

import com.datawizards.dmg.service.ElasticsearchServiceImpl

case class Person(name: String, age: Int)

object CreateElasticsearchIndex extends App {
  val service = new ElasticsearchServiceImpl("http://localhost:9200")
  service.createIndex[Person]("person")

  println("Index:")
  println(service.getIndexSettings("person"))
}

Create Elasticsearch template

import com.datawizards.dmg.examples.TestModel.PersonWithMultipleEsAnnotations
import com.datawizards.dmg.service.ElasticsearchServiceImpl

object CreateElasticsearchTemplate extends App {
  val service = new ElasticsearchServiceImpl("http://localhost:9200")
  service.updateTemplate[PersonWithMultipleEsAnnotations]("people")

  println("Template:")
  println(service.getTemplate("people"))
}

Create Hive table

import com.datawizards.dmg.service.HiveServiceImpl

case class Person(name: String, age: Int)

HiveServiceImpl.createHiveTable[Person]()

Extracting class metadata

To extract class metadata, use the method MetaDataWithDialectExtractor.extractClassMetaDataForDialect. Example:

import com.datawizards.dmg.dialects
import com.datawizards.dmg.dialects.MetaDataWithDialectExtractor

case class Person(name: String, age: Int)

MetaDataWithDialectExtractor.extractClassMetaDataForDialect[Person](Some(dialects.HiveDialect))

Customizations

Custom column name

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(
  @column(name="personName")
  name: String,
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)

CREATE TABLE Person(
   personName VARCHAR,
   age INT
);

Custom column name specific for dialect

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(
  @column(name="NAME")
  @column(name="personName", dialects.ElasticsearchDialect)
  name: String,
  @column(name="AGE")
  @column(name="personAge", dialects.ElasticsearchDialect)
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)

CREATE TABLE Person(
   NAME VARCHAR,
   AGE INT
);

{
   "mappings" : {
      "Person" : {
         "personName" : {"type" : "string"},
         "personAge" : {"type" : "integer"}
      }
   }
}

Custom table name

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.{DataModelGenerator, dialects}

@table("PEOPLE")
case class Person(
  name: String,
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)

CREATE TABLE PEOPLE(
   name VARCHAR,
   age INT
);

Custom table name specific for dialect

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.{DataModelGenerator, dialects}

@table("PEOPLE")
@table("person", dialects.ElasticsearchDialect)
case class Person(
  name: String,
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)

CREATE TABLE PEOPLE(
   name VARCHAR,
   age INT
);

{
   "mappings" : {
      "person" : {
         "name" : {"type" : "string"},
         "age" : {"type" : "integer"}
      }
   }
}

Placeholders

data-model-generator supports placeholder variables when generating a data model. Placeholder variables can be used in any annotation.

A typical use case for placeholder variables is generating a table name that depends on the environment, for example when each environment (development, UAT, production) has a dedicated DB schema.

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.service.TemplateHandler
import com.datawizards.dmg.{DataModelGenerator, dialects}

@table("${environment}.people")
case class Person(
    name: String,
    age: Int
)

TemplateHandler.inflate(DataModelGenerator.generate[Person](dialects.H2Dialect), Map("environment" -> "development"))

TemplateHandler.inflate(DataModelGenerator.generate[Person](dialects.H2Dialect), Map("environment" -> "production"))

Generates:

CREATE TABLE development.people(
   name VARCHAR,
   age INT
);

CREATE TABLE production.people(
   name VARCHAR,
   age INT
);

Documentation comments

import com.datawizards.dmg.annotations._

@comment("People data")
case class PersonWithComments(
    @comment("Person name") name: String,
    age: Int
)

H2

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.{DataModelGenerator, dialects}

DataModelGenerator.generate[PersonWithComments](dialects.H2Dialect)

CREATE TABLE PersonWithComments(
   name VARCHAR COMMENT 'Person name',
   age INT
);
COMMENT ON TABLE PersonWithComments IS 'People data';

Hive

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.DataModelGenerator

DataModelGenerator.generate[PersonWithComments](new HiveGenerator)

CREATE TABLE PersonWithComments(
   name STRING COMMENT 'Person name',
   age INT
)
COMMENT 'People data';

Redshift

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.{DataModelGenerator, dialects}

DataModelGenerator.generate[PersonWithComments](dialects.RedshiftDialect)

CREATE TABLE PersonWithComments(
   name VARCHAR,
   age INTEGER
);
COMMENT ON TABLE PersonWithComments IS 'People data';
COMMENT ON COLUMN PersonWithComments.name IS 'Person name';

Avro schema

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.{DataModelGenerator, dialects}

DataModelGenerator.generate[PersonWithComments](dialects.AvroSchemaDialect)

{
   "namespace": "com.datawizards.dmg.examples",
   "type": "record",
   "name": "PersonWithComments",
   "doc": "People data",
   "fields": [
      {"name": "name", "type": "string", "doc": "Person name"},
      {"name": "age", "type": "int"}
   ]
}

Column length

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.{DataModelGenerator, dialects}


case class Person(
  @length(1000) name: String,
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)

CREATE TABLE Person(
   name VARCHAR(1000),
   age INT
);

Not null

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.{DataModelGenerator, dialects}

case class PersonWithNull(
  @notNull name: String,
  age: Int
)

DataModelGenerator.generate[PersonWithNull](dialects.H2Dialect)
DataModelGenerator.generate[PersonWithNull](dialects.RedshiftDialect)
DataModelGenerator.generate[PersonWithNull](dialects.AvroSchemaDialect)

H2 - not null

CREATE TABLE PersonWithNull(
   name VARCHAR NOT NULL,
   age INT
);

Redshift - not null

CREATE TABLE PersonWithNull(
   name VARCHAR NOT NULL,
   age INTEGER
);

Avro schema - not null

{
   "namespace": "com.datawizards.dmg",
   "type": "record",
   "name": "PersonWithNull",
   "fields": [
      {"name": "name", "type": "string"},
      {"name": "age", "type": ["null", "int"]}
   ]
}

Underscore

Convert table and column names for selected dialect to underscore convention.

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.{DataModelGenerator, dialects}

@underscore(dialect=dialects.H2Dialect)
case class PersonWithUnderscore(
    personName: String,
    personAge: Int
)

CREATE TABLE person_with_underscore(
   person_name VARCHAR,
   person_age INT
);

Hive customizations

Hive external table

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.hive._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.{DataModelGenerator, dialects}

@hiveExternalTable(location="hdfs:///data/people")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](new HiveGenerator)

CREATE EXTERNAL TABLE Person(
   name STRING,
   age INT
)
LOCATION 'hdfs:///data/people';

Hive ROW FORMAT SERDE

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.hive._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.{DataModelGenerator, dialects}

@hiveRowFormatSerde(format="org.apache.hadoop.hive.serde2.avro.AvroSerDe")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](new HiveGenerator)

CREATE TABLE Person(
   name STRING,
   age INT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe';

Hive STORED AS

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.hive._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.{DataModelGenerator, dialects}

@hiveStoredAs(format="PARQUET")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](new HiveGenerator)

CREATE TABLE Person(
   name STRING,
   age INT
)
STORED AS PARQUET;

Hive TABLE PROPERTIES

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.hive._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.{DataModelGenerator, dialects}

@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](new HiveGenerator)

CREATE TABLE Person(
   name STRING,
   age INT
)
TBLPROPERTIES(
   'key1' = 'value1',
   'key2' = 'value2',
   'key3' = 'value3'
);

Hive avro schema url property

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.hive._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.{DataModelGenerator, dialects}

@hiveTableProperty("avro.schema.url", "hdfs:///metadata/person.avro")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](new HiveGenerator)

If the "avro.schema.url" table property is provided, the generated data model doesn't contain any column definitions, because Hive takes them from the Avro schema.

CREATE TABLE Person
TBLPROPERTIES(
   'avro.schema.url' = 'hdfs:///metadata/person.avro'
);

Hive partition columns

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.hive._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.{DataModelGenerator, dialects}

case class ClicksPartitioned(
    time: java.sql.Timestamp,
    event: String,
    user: String,
    @hivePartitionColumn
    year: Int,
    @hivePartitionColumn
    month: Int,
    @hivePartitionColumn
    day: Int
)

DataModelGenerator.generate[ClicksPartitioned](new HiveGenerator)

CREATE TABLE ClicksPartitioned(
   time TIMESTAMP,
   event STRING,
   user STRING
)
PARTITIONED BY(year INT, month INT, day INT);

Hive partition columns - order

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.hive._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.{DataModelGenerator, dialects}

case class ClicksPartitionedWithOrder(
    time: java.sql.Timestamp,
    event: String,
    user: String,
    @hivePartitionColumn(order=3)
    day: Int,
    @hivePartitionColumn(order=1)
    year: Int,
    @hivePartitionColumn(order=2)
    month: Int
)

DataModelGenerator.generate[ClicksPartitionedWithOrder](new HiveGenerator)

CREATE TABLE ClicksPartitionedWithOrder(
   time TIMESTAMP,
   event STRING,
   user STRING
)
PARTITIONED BY(year INT, month INT, day INT);

Hive Parquet table with many annotations

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.hive._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.{DataModelGenerator, dialects}

@table("CUSTOM_TABLE_NAME")
@comment("Table comment")
@hiveStoredAs(format="PARQUET")
@hiveExternalTable(location="hdfs:///data/table")
@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class ParquetTableWithManyAnnotations(
    @column("eventTime")
    @comment("Event time")
    time: java.sql.Timestamp,
    @comment("Event name")
    event: String,
    @comment("User id")
    user: String,
    @hivePartitionColumn(order=3)
    day: Int,
    @hivePartitionColumn(order=1)
    year: Int,
    @hivePartitionColumn(order=2)
    month: Int
)

DataModelGenerator.generate[ParquetTableWithManyAnnotations](new HiveGenerator)

DROP TABLE IF EXISTS CUSTOM_TABLE_NAME;
CREATE EXTERNAL TABLE CUSTOM_TABLE_NAME(
   eventTime TIMESTAMP COMMENT 'Event time',
   event STRING COMMENT 'Event name',
   user STRING COMMENT 'User id'
)
COMMENT 'Table comment'
PARTITIONED BY(year INT, month INT, day INT)
STORED AS PARQUET
LOCATION 'hdfs:///data/table'
TBLPROPERTIES(
   'key1' = 'value1',
   'key2' = 'value2',
   'key3' = 'value3'
);

Hive Avro table with many annotations

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.hive._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.{DataModelGenerator, dialects}

@table("CUSTOM_TABLE_NAME")
@comment("Table comment")
@hiveRowFormatSerde(format="org.apache.hadoop.hive.serde2.avro.AvroSerDe")
@hiveStoredAs("INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'")
@hiveExternalTable(location="hdfs:///data/table")
@hiveTableProperty("avro.schema.url", "hdfs:///metadata/table.avro")
@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class AvroTableWithManyAnnotations(
    @column("eventTime")
    @comment("Event time")
    time: java.sql.Timestamp,
    @comment("Event name")
    event: String,
    @comment("User id")
    user: String,
    @hivePartitionColumn(order=3)
    day: Int,
    @hivePartitionColumn(order=1)
    year: Int,
    @hivePartitionColumn(order=2)
    month: Int
)

DataModelGenerator.generate[AvroTableWithManyAnnotations](new HiveGenerator)

DROP TABLE IF EXISTS CUSTOM_TABLE_NAME;
CREATE EXTERNAL TABLE CUSTOM_TABLE_NAME
COMMENT 'Table comment'
PARTITIONED BY(year INT, month INT, day INT)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION 'hdfs:///data/table'
TBLPROPERTIES(
   'avro.schema.url' = 'hdfs:///metadata/table.avro',
   'key1' = 'value1',
   'key2' = 'value2',
   'key3' = 'value3'
);

Skip table generation if it is unchanged

Sometimes a case class (one of many) has not been modified, and dropping and re-creating its table takes a long time because the table contains many partitions and files. In such a case we can override HiveGenerator and add custom logic that fetches a table property from the Hive metastore. That property is then compared against a hash calculated from the case class metadata. This guarantees that the table is re-created whenever any property changes. If the case class is unchanged since the previous table creation, the generated code is commented out.

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.hive._
import com.datawizards.dmg.generator.HiveGenerator
import com.datawizards.dmg.metadata.ClassTypeMetaData
import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](new HiveGenerator(){
        override def getHashFromTableDefinition(metadata: ClassTypeMetaData): Option[Long] = {
          // TODO: connect to Hive and fetch table property named MODEL_GENERATOR_METADATA_HASH
          Some(877255039)
        }
      })

In case getHashFromTableDefinition returns 877255039, code generated by this is:

--Not re-creating table for class Person because it was not modified.
--CREATE TABLE Person(
--  name STRING,
--  age INT
--)
--TBLPROPERTIES(   'MODEL_GENERATOR_METADATA_HASH' = '877255039')
--;

In case getHashFromTableDefinition returns something other than 877255039, the table is re-created and the generated code is:

 CREATE TABLE Person(
   name STRING,
   age INT
)
TBLPROPERTIES(   'MODEL_GENERATOR_METADATA_HASH' = '877255039')
;

Elasticsearch customizations

index settings

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.es._
import com.datawizards.dmg.{DataModelGenerator, dialects}


@esSetting("number_of_shards", 1)
@esSetting("number_of_replicas", 3)
@esSetting("blocks.read_only", true)
@esSetting("codec", "best_compression")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)

{
   "settings" : {
      "number_of_shards" : 1,
      "number_of_replicas" : 3,
      "blocks.read_only" : "true",
      "codec" : "best_compression"
   },
   "mappings" : {
      "Person" : {
         "properties" : {
            "name" : {"type" : "string"},
            "age" : {"type" : "integer"}
         }
      }
   }
}

index parameter

Index parameter: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-index.html

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.es._
import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(
    @esIndex("not_analyzed") name: String,
    age: Int
)

DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)

{
   "mappings" : {
      "Person" : {
         "properties" : {
            "name" : {"type" : "string", "index" : "not_analyzed"},
            "age" : {"type" : "integer"}
         }
      }
   }
}

format parameter

Date format parameter: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.es._
import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(
    name: String,
    @esFormat("yyyy-MM-dd") birthday: java.sql.Date
)

DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)

{
   "mappings" : {
      "Person" : {
         "properties" : {
            "name" : {"type" : "string"},
            "birthday" : {"type" : "date", "format" : "yyyy-MM-dd"}
         }
      }
   }
}

Elasticsearch template

https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-templates.html

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.es._
import com.datawizards.dmg.{DataModelGenerator, dialects}

@esTemplate("people*")
case class PersonWithEsTemplate(name: String, age: Int)

DataModelGenerator.generate[PersonWithEsTemplate](dialects.ElasticsearchDialect)

{
   "template" : "people*",
   "mappings" : {
      "PersonWithEsTemplate" : {
         "properties" : {
            "name" : {"type" : "string"},
            "age" : {"type" : "integer"}
         }
      }
   }
}

Elasticsearch multiple annotations

import com.datawizards.dmg.annotations._
import com.datawizards.dmg.annotations.es._
import com.datawizards.dmg.{DataModelGenerator, dialects}

@table("people")
@esTemplate("people*")
@esSetting("number_of_shards", 1)
@esSetting("number_of_replicas", 3)
case class PersonWithMultipleEsAnnotations(
    @esIndex("not_analyzed")
    @column("personName")
    name: String,
    @column("personBirthday")
    @esFormat("yyyy-MM-dd")
    birthday: java.sql.Date
)

DataModelGenerator.generate[PersonWithMultipleEsAnnotations](dialects.ElasticsearchDialect)

{
   "template" : "people*",
   "settings" : {
      "number_of_shards" : 1,
      "number_of_replicas" : 3
   },
   "mappings" : {
      "people" : {
         "properties" : {
            "personName" : {"type" : "string", "index" : "not_analyzed"},
            "personBirthday" : {"type" : "date", "format" : "yyyy-MM-dd"}
         }
      }
   }
}

mateuszboryn/data-model-generator

data-model-generator

Table of contents

Goals

Getting started

Dialects

H2 dialect

Hive dialect

Redshift dialect

MySQL dialect

Avro schema dialect

Avro schema

Avro schema for Avro Schema Registry

Elasticsearch dialect

Java dialect

Installers

Register Avro schema to Avro schema registry

Copy Avro schema to HDFS

Create Elasticsearch index

Create Elasticsearch template

Create Hive table

Extracting class metadata

Customizations

Custom column name

Custom column name specific for dialect

Custom table name

Custom table name specific for dialect

Placeholders

Documentation comments

H2

Hive

Redshift

Avro schema

Column length

Not null

H2 - not null

Redshift - not null

Avro schema - not null

Underscore

Hive customizations

Hive external table

Hive ROW FORMAT SERDE

Hive STORED AS

Hive TABLE PROPERTIES

Hive avro schema url property

Hive partition columns

Hive partition columns - order

Hive Parquet table with many annotations

Hive Avro table with many annotations

Skip table generation if it is unchanged

Elasticsearch customizations

index settings

index parameter

format parameter

Elasticsearch template

Elasticsearch multiple annotations