bblfsh/python-client

[feature request] add position information

EgorBu opened this issue · 8 comments

Hi,
I found that the python-driver lacks position information for several types of tokens.

import bblfsh

# connect to the bblfsh server
client = bblfsh.BblfshClient("0.0.0.0:9432")
file_loc = "location/of/file.py"

# load the raw source text of the file
with open(file_loc, "r") as f:
    content = f.read()

# parse the file and take the resulting uast
uast = client.parse(file_loc).uast

# keep only the nodes that carry a token, ordered by start offset
nodes = [node for node in bblfsh.iterator(uast, bblfsh.TreeOrder.PRE_ORDER) if node.token]
nodes.sort(key=lambda n: n.start_position.offset)

# for every token print: start offset, token text, the source slice of token
# length taken at the start offset, and the source slice between the start
# and end offsets
for n in nodes:
    start = n.start_position.offset
    end = n.end_position.offset
    by_token_length = content[start:start + len(n.token)]
    by_positions = content[start:end + 1]
    print(start, n.token, by_token_length, by_positions, sep="|")

The source code I used is in the details below:

import argparse
import os
import tempfile
import unittest

import sourced.ml.tests.models as paths
from sourced.ml.models import Topics
from sourced.ml.cmd import bigartm2asdf


class TopicsTests(unittest.TestCase):
    def setUp(self):
        self.model = Topics().load(source=paths.TOPICS)

    def test_dump(self):
        res = self.model.dump()
        self.assertEqual(res, """320 topics, 1000 tokens
First 10 tokens: ['ulcancel', 'domainlin', 'trudi', 'fncreateinstancedbaselin', 'wbnz', 'lmultiplicand', 'otronumero', 'qxln', 'gvgq', 'polaroidish']
Topics: unlabeled
non-zero elements: 6211  (0.019409)""")  # noqa

    def test_props(self):
        self.assertEqual(len(self.model), 320)
        self.assertEqual(len(self.model.tokens), 1000)
        self.assertIsNone(self.model.topics)
        zt = self.model[0]
        self.assertEqual(len(zt), 8)
        self.assertEqual(zt[0][0], "olcustom")
        self.assertAlmostEqual(zt[0][1], 1.23752e-06, 6)

    def test_label(self):
        with self.assertRaises(ValueError):
            self.model.label_topics([1, 2, 3])
        with self.assertRaises(TypeError):
            self.model.label_topics(list(range(320)))
        self.model.label_topics([str(i) for i in range(320)])
        self.assertEqual(self.model.topics[0], "0")

    def test_save(self):
        with tempfile.NamedTemporaryFile(prefix="sourced.ml-topics-test-") as f:
            self.model.save(f.name)
            new = Topics().load(f.name)
            self.assertEqual(self.model.tokens, new.tokens)
            self.assertEqual((self.model.matrix != new.matrix).getnnz(), 0)

    def test_bigartm2asdf(self):
        with tempfile.NamedTemporaryFile(prefix="sourced.ml-topics-test-") as f:
            args = argparse.Namespace(
                input=os.path.join(os.path.dirname(__file__), paths.TOPICS_SRC),
                output=f.name)
            bigartm2asdf(args)
            model = Topics().load(f.name)
            self.assertEqual(len(model), 320)
            self.assertEqual(len(model.tokens), 1000)


if __name__ == "__main__":
    unittest.main()

As a result, we may notice several tokens without position information:

0|argparse|import a|i
0|os|im|i
0|tempfile|import a|i
0|unittest|import a|i
0|sourced.ml.tests.models|import argparse
import |i
0|paths|impor|i
0|sourced.ml.models|import argparse
i|i
0|Topics|import|i
0|sourced.ml.cmd|import argpars|i
0|bigartm2asdf|import argpa|i
0|source|import|i
0|!=|im|i
0|prefix|import|i
0|input|impor|i
0|output|import|i
0|prefix|import|i
0|==|im|i
184|TopicsTests|TopicsTests|TopicsTests

some of them are imports like

0|argparse|import a|i
0|os|im|i

some operators

0|==|im|i
0|!=|im|i

some arguments

0|source|import|i
0|prefix|import|i
0|input|impor|i
0|output|import|i
0|prefix|import|i

I made a small script that should help to understand which positions are missing:

def ordered_nodes(uast):
    """
    Collect the nodes that have a token (pre-order walk) and return them
    as a list sorted by `start_position.offset`
    """
    with_token = [
        node
        for node in bblfsh.iterator(uast, bblfsh.TreeOrder.PRE_ORDER)
        if node.token
    ]
    # stable in-place sort: nodes sharing an offset keep their pre-order sequence
    with_token.sort(key=lambda n: n.start_position.offset)
    return with_token


def transform_content(content, uast, filler="_"):
    """
    Return a copy of `content` where every token node whose start and end
    offsets differ is overwritten with `filler`; text not covered by such
    a node is left as is
    """
    masked = content
    for node in ordered_nodes(uast):
        begin = node.start_position.offset
        stop = node.end_position.offset

        # nodes with identical start and end offsets are skipped
        if begin == stop:
            continue

        # the slice end is taken one past the reported end offset
        stop += 1
        padding = "".join(filler for _ in range(stop - begin))
        masked = masked[:begin] + padding + masked[stop:]
    return masked

and results on code from details above:

import argparse
import os
import tempfile
import unittest

import sourced.ml.tests.models as paths
from sourced.ml.models import Topics
from sourced.ml.cmd import bigartm2asdf


class ___________(________.________):
    def _____(____):
        ____._____ = ______().____(source=_____.______)

    def _________(____):
        ___ = ____._____.____()
        ____.___________(___, _________________________________________________________________________________________________________________________________________________________________________________________________________________________________________) _______

    def __________(____):
        ____.___________(___(____._____), ___)
        ____.___________(___(____._____.______), ____)
        ____.____________(____._____.______)
        __ = ____._____[0]
        ____.___________(___(__), 8)
        ____.___________(__[0][0], __________)
        ____._________________(__[0][1], ___________, 6)

    def __________(____):
        ____ ____.____________(__________):
            ____._____.____________([1, 2, 3])
        ____ ____.____________(_________):
            ____._____.____________(____(_____(___)))
        ____._____.____________([___(i) for i in _____(___)])
        ____.___________(____._____.______[0], ___)

    def _________(____):
        ____ ________.__________________(prefix=_________________________) as f:
            ____._____.____(f.____)
            ___ = ______().____(f.____)
            ____.___________(____._____.______, ___.______)
            ____.___________((____._____.______ != ___.______).______(), 0)

    def _________________(____):
        ____ ________.__________________(prefix=_________________________) as f:
            ____ = ________._________(
                input=__.____.____(__.____._______(________), _____.__________),
                output=f.____)
            ____________(____)
            _____ = ______().____(f.____)
            ____.___________(___(_____), ___)
            ____.___________(___(_____.______), ____)


__ ________ == __________:
    ________.____()

For JavaScript, some positions are missing for positional arguments like /\<img/g:

import Parser from 'rss-parser';
import _ from 'lodash';

const parser = new Parser();

const mediumFeed = 'https://medium.freecodecamp.org/feed';

function getExtract(str) {
  return str.slice(0, str.indexOf('</p>') + 4);
}


function addResponsiveClass(str) {
  return str.replace(/\<img/g, '<img class="img-responsive"');
}

export function getMediumFeed() {
  return new Promise((resolve, reject) => {
    parser.parseURL(mediumFeed, (err, feed) => {
      if (err) {
        reject(err);
      }

      const items = feed.items
        .map(
          item => _.pick(item, ['title', 'link', 'isoDate', 'content:encoded'])
        )
        .map(
          (item) => ({
            ...item,
            extract: getExtract(item['content:encoded'])
          })
        )
        .map(item => _.omit(item, ['content:encoded']))
        .map(item => ({ ...item, extract: addResponsiveClass(item.extract)}));
      resolve(items);
    });
  });
}

visualization:

import ______ from ____________;
import _ from ________;

const ______ = new ______();

const __________ = ______________________________________;

function __________(___) {
  return ___._____(_, ___._______(______) + _);
}


function __________________(___) {
  return ___._______(/\<img/g, _____________________________);
}

export function _____________() {
  return new _______((_______, ______) => {
    ______.________(__________, (___, ____) => {
      if (___) {
        ______(___);
      }

      const _____ = ____._____
        .___(
          ____ => _.____(____, [_______, ______, _________, _________________])
        )
        .___(
          (____) => ({
            ...____,
            _______: __________(____[_________________])
          })
        )
        .___(____ => _.____(____, [_________________]))
        .___(____ => ({ ...____, _______: __________________(____._______)}));
      _______(_____);
    });
  });
}

These issues should go to the Python and Javascript drivers projects. That script is cool!

For the Python driver, we currently use a tokenizer to fix tokens that have at least a line number but no column (which are a lot), but there are some nodes, like import names and some parameters, that don't even have a line number. A workaround would be to do something like this (pseudocode):

curLine = parentNodeLine()
while curLine < len(storedLines):
    tokenPos = findTokenInLine(curLine)
    if tokenPos:
        break
    curLine += 1

This could potentially be somewhat slow but would fix most if not all of the current issues.

How can it handle a corner case like:

f1(arg=bla), f2(arg=bla_bla)

?
`arg` will appear in two places, and both function calls are on the same line.

The current token synchronizer removes a token from the per-line list of tokens (a list of lists) once it has found its position, so by the time it looks up the position of the second arg there would be only one left. But that's just a proposal that could be relatively easy to integrate with what we have; there could be others.

smola commented

You might want to follow this: https://bugs.python.org/issue33337

The script to show missing tokens seems to have some problems: the output is garbled and it's showing some tokens as missing when in fact they're there (like assertIs and others):

https://gist.github.com/juanjux/4d29ae5aec8f2953a1bb47d75034288a