lexIt() does not cope well with Java Unicode escapes
Opened this issue · 3 comments
eddieantonio commented
Try lexing this file from apache/che:
$ javac B.java
$ python lex.py
[('PACKAGE', 'package', (1, 0), (1, 7), 'package'),
('IDENTIFIER', 'p', (1, 8), (1, 9), 'p'),
('SEMI', ';', (1, 9), (1, 10), ';'),
('PUBLIC', 'public', (2, 0), (2, 6), 'public'),
('CLASS', 'class', (2, 7), (2, 12), 'class'),
('IDENTIFIER', '2', (2, 18), (2, 19), 'B'),
('LBRACE', '{', (2, 20), (2, 21), '{'),
('IDENTIFIER', 'p', (3, 1), (3, 2), 'p'),
('DOT', '.', (3, 2), (3, 3), '.'),
('IDENTIFIER', 'B', (3, 3), (3, 4), 'B'),
('IDENTIFIER', 'b', (3, 5), (3, 6), 'b'),
('SEMI', ';', (3, 6), (3, 7), ';'),
('IDENTIFIER', 'p', (4, 1), (4, 2), 'p'),
('DOT', '.\\u004', (4, 2), (4, 8), '.'),
('IDENTIFIER', '2', (4, 8), (4, 9), 'B'),
('IDENTIFIER', 'b2', (4, 10), (4, 12), 'b2'),
('SEMI', ';', (4, 12), (4, 13), ';'),
('IDENTIFIER', '0', (5, 6), (5, 7), 'p'),
('DOT', '.', (5, 7), (5, 8), '.'),
('IDENTIFIER', 'B', (5, 9), (5, 10), 'B'),
('IDENTIFIER', 'b3', (5, 11), (5, 13), 'b3'),
('SEMI', ';', (5, 13), (5, 14), ';'),
('IDENTIFIER', '0', (6, 6), (6, 7), 'p'),
('DOT', '.\\u004', (6, 7), (6, 13), '.'),
('IDENTIFIER', '2', (6, 13), (6, 14), 'B'),
('IDENTIFIER', 'b4', (6, 15), (6, 17), 'b4'),
('SEMI', ';', (6, 17), (6, 18), ';'),
('PUBLIC', 'public', (8, 1), (8, 7), 'public'),
('STATIC', 'static', (8, 8), (8, 14), 'static'),
('IDENTIFIER', '2', (8, 20), (8, 21), 'B'),
('IDENTIFIER', 'newInstance', (8, 22), (8, 33), 'newInstance'),
('LPAREN', '(', (8, 33), (8, 34), '('),
('RPAREN', ')', (8, 34), (8, 35), ')'),
('LBRACE', '{', (8, 36), (8, 37), '{'),
('RETURN', 'return', (9, 2), (9, 8), 'return'),
('NEW', 'new', (9, 9), (9, 12), 'new'),
('IDENTIFIER', '2', (9, 18), (9, 19), 'B'),
('LPAREN', '(', (9, 19), (9, 20), '('),
('RPAREN', ')', (9, 20), (9, 21), ')'),
('SEMI', ';', (9, 21), (9, 22), ';'),
('RBRACE', '}', (10, 1), (10, 2), '}'),
('PUBLIC', 'public', (12, 1), (12, 7), 'public'),
('IDENTIFIER', '2', (12, 13), (12, 14), 'B'),
('LPAREN', '(', (12, 14), (12, 15), '('),
('RPAREN', ')', (12, 15), (12, 16), ')'),
('LBRACE', '{', (12, 17), (12, 18), '{'),
('RBRACE', '}', (13, 1), (13, 2), '}'),
('RBRACE', '}', (14, 0), (14, 1), '}'),
('EOF', '', (14, 1), (14, 1), 'token.end-of-input')]
The "raw" source has some totally funky characters!
What's worse, not properly handling The Java SE Specification §3.5 crashes the server!
A.java's last byte is 0x1A, or the ASCII Substitute character.
javac will parse this just fine.
$ javac A.java
But javac-parser crashes!
$ python lex.py A.java
Traceback (most recent call last):
File "lex.py", line 9, in <module>
tokens = Java().lex(source.read())
File "/Users/eddieantonio/Projects/javac-parser/javac_parser.py", line 147, in lex
return [convert(l) for l in self.lex_call(java_source)]
File "/Users/eddieantonio/Projects/javac-parser/javac_parser.py", line 108, in lex_call
binary = self.app.lexFlat(java_source)
File "/Users/eddieantonio/.pyenv/versions/javac-parser/lib/python3.6/site-packages/py4j/java_gateway.py", line 1160, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/Users/eddieantonio/.pyenv/versions/javac-parser/lib/python3.6/site-packages/py4j/protocol.py", line 320, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o0.lexFlat.
: java.lang.StringIndexOutOfBoundsException: String index out of range: -1
at java.lang.String.substring(String.java:1967)
at ca.ualberta.cs.ParserWrapper.lexIt(ParserWrapper.java:106)
at ca.ualberta.cs.App.lexFlat(App.java:29)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
See these two files for reproduction:
abramhindle commented
Likely candidate is UTF-8 encoding and decoding. Java loves UTF-8 so maybe pin to that.
Deleted user commented
Can you add a test that fails?
On Sep 14, 2017 13:26, "Abram Hindle" <notifications@github.com> wrote:
Likely candidate is UTF-8 encoding and decoding. Java loves UTF-8 so maybe
pin to that.
—
You are receiving this because you are subscribed to this thread.
Reply to this email directly, view it on GitHub
<#1 (comment)>,
or mute the thread
<https://github.com/notifications/unsubscribe-auth/AFX3Dn3d4trI1pMVJ3tY07hvir_QP2Rpks5siX3hgaJpZM4PYC1j>
.
eddieantonio commented