google/lovefield

GroupBy "computed" value

iticonan opened this issue · 3 comments

In short I want to group by date part of a DATE_TIME field. e.g in sqlite this would yield total sales by day (assuming date holds a timestamp of seconds since unix epoch) :
SELECT date, sum(amount) FROM sales GROUP BY sales.date/(60*60*24)
Is there a way to achieve this in lovefield? short of doing all the grouping by code I haven't been able to: Here's a minimalistic example:

var schemaBuilder = lf.schema.create('computed_groupby');
var testDb, sales, rows;
schemaBuilder.createTable('sales')
  .addColumn('id',lf.Type.INTEGER)
  .addColumn('rep',lf.Type.STRING)
  .addColumn('date',lf.Type.DATE_TIME)
  .addColumn('amount',lf.Type.INTEGER)
  .addPrimaryKey(['id'])

schemaBuilder.connect().then( database =>{
  testDb = database
  sales = database.getSchema().table('sales')
  console.log('captured testDB and sales variables')
  rows = []
  let now = new Date().getTime()
  rows.push( sales.createRow({'id':1, 'rep':'alice','date':new Date(now),'amount':100}) )
  rows.push( sales.createRow({'id':2, 'rep':'bob','date':new Date(now+1000),'amount':200}) )
  rows.push( sales.createRow({'id':3, 'rep':'bob','date':new Date(now+2000),'amount':300}) )
  rows.push( sales.createRow({'id':4, 'rep':'carl','date':new Date(now+3000),'amount':400}) )
  rows.push( sales.createRow({'id':5, 'rep':'carl','date':new Date(now+4000),'amount':500}) )
  rows.push( sales.createRow({'id':6, 'rep':'carl','date':new Date(now+5000),'amount':600}) )
  let aDay = 1000*60*60*24
  rows.push( sales.createRow({'id':7, 'rep':'alice','date':new Date(now+aDay),'amount':100}) )
  return testDb.insertOrReplace().into(sales).values(rows).exec()
}).then( () => {
  console.log('inserted rows')
  testDb.select( sales.date, sales.rep, lf.fn.sum(sales.amount))
    .from(sales).groupBy(lf.fn.distinct(sales.date), sales.rep).exec()
    .then( ans => {console.log(ans);console.table(ans) })
})

the last select with distinct almost works, but the last registry should be in a new row.
This is also probably a bug since all dates are distinct by definition it should not group by anything.
More importantly can this be done currently and if not implemented in a future release?
I wouldn't mind having some api for it, e.g ...groupBy(lf.fn.computed(sales.date,'/',1000360024))... but hopefully someone can think of a more compact/elegant solution.
Thanks

This is very unlikely to be provided in Lovefield. I can think of a workaround is adding an additional column to store just the day and group by that, or make your date time value less precise (JS Date goes to milliseconds, if you care about only the day, you don't need that accuracy).

Reason we're not going to provide that is this involving implementing a full subquery engine, which will bloat the API and engine size significantly. For a pure JS implementation it's not feasible per our evaluation.

I've 'kinda' solved my problem so I'm sharing my workaround. Basically I create a helper table with the required computed ranges.

var fillGroupHelper = function(_db, _min, _max, step){
  return new Promise( (resolve, reject) => {
    let tab = _db.getSchema().table('ranges')
    var arr = []
    _db.delete().from(tab).exec().then( () => {
      console.log('deleted * from ranges')
      for( var c=(_min/step|0)*step; c<_max; c+=step ){
        arr.push( tab.createRow({'min':c, 'max':c+step}) )
      }
      return _db.insertOrReplace().into(tab).values(arr).exec()
    }).then( ans => {
       return _db.select().from(tab).exec()
    }).then( ans => {
      console.table(ans)
      ans.forEach( r => { console.log(`min:${r.min} max:${r.max}`) })
      resolve()
    })
  })
}
var schemaBuilder = lf.schema.create('asisted_groupby');
var testDb, sales, ranges, rows, aDay, now = new Date().getTime()/1000|0;
schemaBuilder.createTable('sales')
.addColumn('id',lf.Type.INTEGER)
.addColumn('rep',lf.Type.STRING)
.addColumn('date',lf.Type.INTEGER)
.addColumn('amount',lf.Type.INTEGER)
.addPrimaryKey(['id']);
schemaBuilder.createTable('ranges')
.addColumn('min', lf.Type.INTEGER)
.addColumn('max', lf.Type.INTEGER)
.addPrimaryKey(['min']);
schemaBuilder.connect().then( database =>{
 testDb = database
 sales = database.getSchema().table('sales')
 ranges = database.getSchema().table('ranges')
 console.log('captured testDB,sales and ranges variables')
 rows = []
 rows.push( sales.createRow({'id':1, 'rep':'alice','date':now,'amount':100}) )
 rows.push( sales.createRow({'id':2, 'rep':'bob','date':now+1,'amount':200}) )
 rows.push( sales.createRow({'id':3, 'rep':'bob','date':now+2,'amount':300}) )
 rows.push( sales.createRow({'id':4, 'rep':'carl','date':now+3,'amount':500}) )
 rows.push( sales.createRow({'id':5, 'rep':'carl','date':now+4,'amount':500}) )
 aDay = 86400
 rows.push( sales.createRow({'id':6, 'rep':'carl','date':now-aDay,'amount':500}) )
 rows.push( sales.createRow({'id':7, 'rep':'alice','date':now+aDay,'amount':100}) )
 return testDb.insertOrReplace().into(sales).values(rows).exec()
}).then( () => {
  console.log('inserted rows in sales')
  let offset = new Date().getTimezoneOffset()*60
  return fillGroupHelper(testDb, now - aDay - offset, now + aDay - offset, aDay)
}).then( () => {
  return testDb.select(ranges.min, sales.rep, lf.fn.sum(sales.amount))
  .from(sales, ranges)
  .where(lf.op.and(sales.date.gte(ranges.min), sales.date.lt(ranges.max)))
  .groupBy(ranges.min, sales.rep)
  .exec()
}).then( ans => {
  console.log(ans);
  ans.forEach( r => { console.log(`${r.ranges.min} ~ ${r.sales.rep} ~ `) })
})

My current drawbacks are:

  • I'll need to change my DATE_TIME to INTEGER anywhere I want to groupBy a range. ( I had some trouble with DATE_TIME, I suppose I could make it work but since I'm adding a table and never needed miliseconds might as well save some space)
  • only one query at the time can use it.
  • Probably less performance than a simple select and pure javascript grouping?

Pros:

  • Won't need to change schema again for other groupBys
  • Once setup only an extra function call needed. so minimal required code

PS: I still think the distinct behaviour in my first post is a bug

This is also probably a bug since all dates are distinct by definition it should not group by anything.

I'll investigate this and possibly open a new bug if I discover an issue.

I wouldn't mind having some api for it, e.g ...groupBy(lf.fn.computed(sales.date,'/',1000360024)).

FYI, there is a related proposal to allow expressions in queries at #141. It is not being worked on currently, but in theory similar approach could be used for SELECT queries, not just UPDATE.