author | cinap_lenrek <cinap_lenrek@localhost> | 2011-05-03 11:25:13 +0000
---|---|---
committer | cinap_lenrek <cinap_lenrek@localhost> | 2011-05-03 11:25:13 +0000
commit | 458120dd40db6b4df55a4e96b650e16798ef06a0 (patch) |
tree | 8f82685be24fef97e715c6f5ca4c68d34d5074ee | /sys/src/cmd/python/Parser
parent | 3a742c699f6806c1145aea5149bf15de15a0afd7 (diff) |
add hg and python
Diffstat (limited to 'sys/src/cmd/python/Parser')
25 files changed, 6940 insertions, 0 deletions
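The first file below, Python.asdl, defines the Python AST in the Zephyr ASDL notation, and asdl_c.py (also in this diff) generates C declarations from it. As an orientation for reading both files, here is a sketch of the kind of C that asdl_c.py's TypeDefVisitor and StructVisitor would emit for a simple sum such as `boolop = And | Or` and for the first constructors of `expr`; it is reconstructed from the emit strings in those visitors, abbreviated, and not copied from real generated output:

```c
/* Sketch of asdl_c.py output, reconstructed from the emit strings in
 * TypeDefVisitor and StructVisitor below; abbreviated, not verbatim. */
typedef struct asdl_seq asdl_seq;   /* really declared in Include/asdl.h */

/* A sum whose constructors carry no fields becomes a plain enum. */
typedef enum _boolop { And=1, Or=2 } boolop_ty;
typedef enum _operator { Add=1, Sub=2 /* ... */ } operator_ty;

/* A sum with fields becomes a kind tag plus a union of field structs. */
typedef struct _expr *expr_ty;

enum _expr_kind { BoolOp_kind=1, BinOp_kind=2 /* ... */ };

struct _expr {
    enum _expr_kind kind;
    union {
        struct {
            boolop_ty op;
            asdl_seq *values;     /* 'expr* values' maps to a sequence */
        } BoolOp;
        struct {
            expr_ty left;
            operator_ty op;
            expr_ty right;
        } BinOp;
        /* ... one member per constructor ... */
    } v;
    int lineno;                   /* from 'attributes (int lineno, ...)' */
    int col_offset;
};
```

The real output additionally carries constructor prototypes (PrototypeVisitor) and arena-allocating constructor functions (FunctionVisitor), both visible further down in asdl_c.py.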
diff --git a/sys/src/cmd/python/Parser/Python.asdl b/sys/src/cmd/python/Parser/Python.asdl new file mode 100644 index 000000000..fa4b406df --- /dev/null +++ b/sys/src/cmd/python/Parser/Python.asdl @@ -0,0 +1,115 @@ +-- ASDL's five builtin types are identifier, int, string, object, bool + +module Python version "$Revision: 43614 $" +{ + mod = Module(stmt* body) + | Interactive(stmt* body) + | Expression(expr body) + + -- not really an actual node but useful in Jython's typesystem. + | Suite(stmt* body) + + stmt = FunctionDef(identifier name, arguments args, + stmt* body, expr* decorators) + | ClassDef(identifier name, expr* bases, stmt* body) + | Return(expr? value) + + | Delete(expr* targets) + | Assign(expr* targets, expr value) + | AugAssign(expr target, operator op, expr value) + + -- not sure if bool is allowed, can always use int + | Print(expr? dest, expr* values, bool nl) + + -- use 'orelse' because else is a keyword in target languages + | For(expr target, expr iter, stmt* body, stmt* orelse) + | While(expr test, stmt* body, stmt* orelse) + | If(expr test, stmt* body, stmt* orelse) + | With(expr context_expr, expr? optional_vars, stmt* body) + + -- 'type' is a bad name + | Raise(expr? type, expr? inst, expr? tback) + | TryExcept(stmt* body, excepthandler* handlers, stmt* orelse) + | TryFinally(stmt* body, stmt* finalbody) + | Assert(expr test, expr? msg) + + | Import(alias* names) + | ImportFrom(identifier module, alias* names, int? level) + + -- Doesn't capture requirement that locals must be + -- defined if globals is + -- still supports use as a function! + | Exec(expr body, expr? globals, expr? locals) + + | Global(identifier* names) + | Expr(expr value) + | Pass | Break | Continue + + -- XXX Jython will be different + -- col_offset is the byte offset in the utf8 string the parser uses + attributes (int lineno, int col_offset) + + -- BoolOp() can use left & right? + expr = BoolOp(boolop op, expr* values) + | BinOp(expr left, operator op, expr right) + | UnaryOp(unaryop op, expr operand) + | Lambda(arguments args, expr body) + | IfExp(expr test, expr body, expr orelse) + | Dict(expr* keys, expr* values) + | ListComp(expr elt, comprehension* generators) + | GeneratorExp(expr elt, comprehension* generators) + -- the grammar constrains where yield expressions can occur + | Yield(expr? value) + -- need sequences for compare to distinguish between + -- x < 4 < 3 and (x < 4) < 3 + | Compare(expr left, cmpop* ops, expr* comparators) + | Call(expr func, expr* args, keyword* keywords, + expr? starargs, expr? kwargs) + | Repr(expr value) + | Num(object n) -- a number as a PyObject. + | Str(string s) -- need to specify raw, unicode, etc? + -- other literals? bools? + + -- the following expression can appear in assignment context + | Attribute(expr value, identifier attr, expr_context ctx) + | Subscript(expr value, slice slice, expr_context ctx) + | Name(identifier id, expr_context ctx) + | List(expr* elts, expr_context ctx) + | Tuple(expr *elts, expr_context ctx) + + -- col_offset is the byte offset in the utf8 string the parser uses + attributes (int lineno, int col_offset) + + expr_context = Load | Store | Del | AugLoad | AugStore | Param + + slice = Ellipsis | Slice(expr? lower, expr? upper, expr? 
step) + | ExtSlice(slice* dims) + | Index(expr value) + + boolop = And | Or + + operator = Add | Sub | Mult | Div | Mod | Pow | LShift + | RShift | BitOr | BitXor | BitAnd | FloorDiv + + unaryop = Invert | Not | UAdd | USub + + cmpop = Eq | NotEq | Lt | LtE | Gt | GtE | Is | IsNot | In | NotIn + + comprehension = (expr target, expr iter, expr* ifs) + + -- not sure what to call the first argument for raise and except + -- TODO(jhylton): Figure out if there is a better way to handle + -- lineno and col_offset fields, particularly when + -- ast is exposed to Python. + excepthandler = (expr? type, expr? name, stmt* body, int lineno, + int col_offset) + + arguments = (expr* args, identifier? vararg, + identifier? kwarg, expr* defaults) + + -- keyword arguments supplied to call + keyword = (identifier arg, expr value) + + -- import name with optional 'as' alias. + alias = (identifier name, identifier? asname) +} diff --git a/sys/src/cmd/python/Parser/acceler.c b/sys/src/cmd/python/Parser/acceler.c new file mode 100644 index 000000000..b41b2654f --- /dev/null +++ b/sys/src/cmd/python/Parser/acceler.c @@ -0,0 +1,125 @@ + +/* Parser accelerator module */ + +/* The parser as originally conceived had disappointing performance. + This module does some precomputation that speeds up the selection + of a DFA based upon a token, turning a search through an array + into a simple indexing operation. The parser now cannot work + without the accelerators installed. Note that the accelerators + are installed dynamically when the parser is initialized, they + are not part of the static data structure written on graminit.[ch] + by the parser generator. */ + +#include "pgenheaders.h" +#include "grammar.h" +#include "node.h" +#include "token.h" +#include "parser.h" + +/* Forward references */ +static void fixdfa(grammar *, dfa *); +static void fixstate(grammar *, state *); + +void +PyGrammar_AddAccelerators(grammar *g) +{ + dfa *d; + int i; + d = g->g_dfa; + for (i = g->g_ndfas; --i >= 0; d++) + fixdfa(g, d); + g->g_accel = 1; +} + +void +PyGrammar_RemoveAccelerators(grammar *g) +{ + dfa *d; + int i; + g->g_accel = 0; + d = g->g_dfa; + for (i = g->g_ndfas; --i >= 0; d++) { + state *s; + int j; + s = d->d_state; + for (j = 0; j < d->d_nstates; j++, s++) { + if (s->s_accel) + PyObject_FREE(s->s_accel); + s->s_accel = NULL; + } + } +} + +static void +fixdfa(grammar *g, dfa *d) +{ + state *s; + int j; + s = d->d_state; + for (j = 0; j < d->d_nstates; j++, s++) + fixstate(g, s); +} + +static void +fixstate(grammar *g, state *s) +{ + arc *a; + int k; + int *accel; + int nl = g->g_ll.ll_nlabels; + s->s_accept = 0; + accel = (int *) PyObject_MALLOC(nl * sizeof(int)); + if (accel == NULL) { + fprintf(stderr, "no mem to build parser accelerators\n"); + exit(1); + } + for (k = 0; k < nl; k++) + accel[k] = -1; + a = s->s_arc; + for (k = s->s_narcs; --k >= 0; a++) { + int lbl = a->a_lbl; + label *l = &g->g_ll.ll_label[lbl]; + int type = l->lb_type; + if (a->a_arrow >= (1 << 7)) { + printf("XXX too many states!\n"); + continue; + } + if (ISNONTERMINAL(type)) { + dfa *d1 = PyGrammar_FindDFA(g, type); + int ibit; + if (type - NT_OFFSET >= (1 << 7)) { + printf("XXX too high nonterminal number!\n"); + continue; + } + for (ibit = 0; ibit < g->g_ll.ll_nlabels; ibit++) { + if (testbit(d1->d_first, ibit)) { + if (accel[ibit] != -1) + printf("XXX ambiguity!\n"); + accel[ibit] = a->a_arrow | (1 << 7) | + ((type - NT_OFFSET) << 8); + } + } + } + else if (lbl == EMPTY) + s->s_accept = 1; + else if (lbl >= 0 && lbl < nl) + accel[lbl] = 
a->a_arrow; + } + while (nl > 0 && accel[nl-1] == -1) + nl--; + for (k = 0; k < nl && accel[k] == -1;) + k++; + if (k < nl) { + int i; + s->s_accel = (int *) PyObject_MALLOC((nl-k) * sizeof(int)); + if (s->s_accel == NULL) { + fprintf(stderr, "no mem to add parser accelerators\n"); + exit(1); + } + s->s_lower = k; + s->s_upper = nl; + for (i = 0; k < nl; i++, k++) + s->s_accel[i] = accel[k]; + } + PyObject_FREE(accel); +} diff --git a/sys/src/cmd/python/Parser/asdl.py b/sys/src/cmd/python/Parser/asdl.py new file mode 100644 index 000000000..bd892b695 --- /dev/null +++ b/sys/src/cmd/python/Parser/asdl.py @@ -0,0 +1,415 @@ +"""An implementation of the Zephyr Abstract Syntax Definition Language. + +See http://asdl.sourceforge.net/ and +http://www.cs.princeton.edu/~danwang/Papers/dsl97/dsl97-abstract.html. + +Only supports top level module decl, not view. I'm guessing that view +is intended to support the browser and I'm not interested in the +browser. + +Changes for Python: Add support for module versions +""" + +#__metaclass__ = type + +import os +import traceback + +import spark + +class Token: + # spark seems to dispatch in the parser based on a token's + # type attribute + def __init__(self, type, lineno): + self.type = type + self.lineno = lineno + + def __str__(self): + return self.type + + def __repr__(self): + return str(self) + +class Id(Token): + def __init__(self, value, lineno): + self.type = 'Id' + self.value = value + self.lineno = lineno + + def __str__(self): + return self.value + +class String(Token): + def __init__(self, value, lineno): + self.type = 'String' + self.value = value + self.lineno = lineno + +class ASDLSyntaxError: + + def __init__(self, lineno, token=None, msg=None): + self.lineno = lineno + self.token = token + self.msg = msg + + def __str__(self): + if self.msg is None: + return "Error at '%s', line %d" % (self.token, self.lineno) + else: + return "%s, line %d" % (self.msg, self.lineno) + +class ASDLScanner(spark.GenericScanner, object): + + def tokenize(self, input): + self.rv = [] + self.lineno = 1 + super(ASDLScanner, self).tokenize(input) + return self.rv + + def t_id(self, s): + r"[\w\.]+" + # XXX doesn't distinguish upper vs. lower, which is + # significant for ASDL. + self.rv.append(Id(s, self.lineno)) + + def t_string(self, s): + r'"[^"]*"' + self.rv.append(String(s, self.lineno)) + + def t_xxx(self, s): # not sure what this production means + r"<=" + self.rv.append(Token(s, self.lineno)) + + def t_punctuation(self, s): + r"[\{\}\*\=\|\(\)\,\?\:]" + self.rv.append(Token(s, self.lineno)) + + def t_comment(self, s): + r"\-\-[^\n]*" + pass + + def t_newline(self, s): + r"\n" + self.lineno += 1 + + def t_whitespace(self, s): + r"[ \t]+" + pass + + def t_default(self, s): + r" . 
+" + raise ValueError, "unmatched input: %s" % `s` + +class ASDLParser(spark.GenericParser, object): + def __init__(self): + super(ASDLParser, self).__init__("module") + + def typestring(self, tok): + return tok.type + + def error(self, tok): + raise ASDLSyntaxError(tok.lineno, tok) + + def p_module_0(self, (module, name, version, _0, _1)): + " module ::= Id Id version { } " + if module.value != "module": + raise ASDLSyntaxError(module.lineno, + msg="expected 'module', found %s" % module) + return Module(name, None, version) + + def p_module(self, (module, name, version, _0, definitions, _1)): + " module ::= Id Id version { definitions } " + if module.value != "module": + raise ASDLSyntaxError(module.lineno, + msg="expected 'module', found %s" % module) + return Module(name, definitions, version) + + def p_version(self, (version, V)): + "version ::= Id String" + if version.value != "version": + raise ASDLSyntaxError(version.lineno, + msg="expected 'version', found %" % version) + return V + + def p_definition_0(self, (definition,)): + " definitions ::= definition " + return definition + + def p_definition_1(self, (definitions, definition)): + " definitions ::= definition definitions " + return definitions + definition + + def p_definition(self, (id, _, type)): + " definition ::= Id = type " + return [Type(id, type)] + + def p_type_0(self, (product,)): + " type ::= product " + return product + + def p_type_1(self, (sum,)): + " type ::= sum " + return Sum(sum) + + def p_type_2(self, (sum, id, _0, attributes, _1)): + " type ::= sum Id ( fields ) " + if id.value != "attributes": + raise ASDLSyntaxError(id.lineno, + msg="expected attributes, found %s" % id) + if attributes: + attributes.reverse() + return Sum(sum, attributes) + + def p_product(self, (_0, fields, _1)): + " product ::= ( fields ) " + # XXX can't I just construct things in the right order? + fields.reverse() + return Product(fields) + + def p_sum_0(self, (constructor,)): + " sum ::= constructor """ + return [constructor] + + def p_sum_1(self, (constructor, _, sum)): + " sum ::= constructor | sum " + return [constructor] + sum + + def p_sum_2(self, (constructor, _, sum)): + " sum ::= constructor | sum " + return [constructor] + sum + + def p_constructor_0(self, (id,)): + " constructor ::= Id " + return Constructor(id) + + def p_constructor_1(self, (id, _0, fields, _1)): + " constructor ::= Id ( fields ) " + # XXX can't I just construct things in the right order? + fields.reverse() + return Constructor(id, fields) + + def p_fields_0(self, (field,)): + " fields ::= field " + return [field] + + def p_fields_1(self, (field, _, fields)): + " fields ::= field , fields " + return fields + [field] + + def p_field_0(self, (type,)): + " field ::= Id " + return Field(type) + + def p_field_1(self, (type, name)): + " field ::= Id Id " + return Field(type, name) + + def p_field_2(self, (type, _, name)): + " field ::= Id * Id " + return Field(type, name, seq=1) + + def p_field_3(self, (type, _, name)): + " field ::= Id ? Id " + return Field(type, name, opt=1) + + def p_field_4(self, (type, _)): + " field ::= Id * " + return Field(type, seq=1) + + def p_field_5(self, (type, _)): + " field ::= Id ? 
" + return Field(type, opt=1) + +builtin_types = ("identifier", "string", "int", "bool", "object") + +# below is a collection of classes to capture the AST of an AST :-) +# not sure if any of the methods are useful yet, but I'm adding them +# piecemeal as they seem helpful + +class AST: + pass # a marker class + +class Module(AST): + def __init__(self, name, dfns, version): + self.name = name + self.dfns = dfns + self.version = version + self.types = {} # maps type name to value (from dfns) + for type in dfns: + self.types[type.name.value] = type.value + + def __repr__(self): + return "Module(%s, %s)" % (self.name, self.dfns) + +class Type(AST): + def __init__(self, name, value): + self.name = name + self.value = value + + def __repr__(self): + return "Type(%s, %s)" % (self.name, self.value) + +class Constructor(AST): + def __init__(self, name, fields=None): + self.name = name + self.fields = fields or [] + + def __repr__(self): + return "Constructor(%s, %s)" % (self.name, self.fields) + +class Field(AST): + def __init__(self, type, name=None, seq=0, opt=0): + self.type = type + self.name = name + self.seq = seq + self.opt = opt + + def __repr__(self): + if self.seq: + extra = ", seq=1" + elif self.opt: + extra = ", opt=1" + else: + extra = "" + if self.name is None: + return "Field(%s%s)" % (self.type, extra) + else: + return "Field(%s, %s%s)" % (self.type, self.name, extra) + +class Sum(AST): + def __init__(self, types, attributes=None): + self.types = types + self.attributes = attributes or [] + + def __repr__(self): + if self.attributes is None: + return "Sum(%s)" % self.types + else: + return "Sum(%s, %s)" % (self.types, self.attributes) + +class Product(AST): + def __init__(self, fields): + self.fields = fields + + def __repr__(self): + return "Product(%s)" % self.fields + +class VisitorBase(object): + + def __init__(self, skip=0): + self.cache = {} + self.skip = skip + + def visit(self, object, *args): + meth = self._dispatch(object) + if meth is None: + return + try: + meth(object, *args) + except Exception, err: + print "Error visiting", repr(object) + print err + traceback.print_exc() + # XXX hack + if hasattr(self, 'file'): + self.file.flush() + os._exit(1) + + def _dispatch(self, object): + assert isinstance(object, AST), repr(object) + klass = object.__class__ + meth = self.cache.get(klass) + if meth is None: + methname = "visit" + klass.__name__ + if self.skip: + meth = getattr(self, methname, None) + else: + meth = getattr(self, methname) + self.cache[klass] = meth + return meth + +class Check(VisitorBase): + + def __init__(self): + super(Check, self).__init__(skip=1) + self.cons = {} + self.errors = 0 + self.types = {} + + def visitModule(self, mod): + for dfn in mod.dfns: + self.visit(dfn) + + def visitType(self, type): + self.visit(type.value, str(type.name)) + + def visitSum(self, sum, name): + for t in sum.types: + self.visit(t, name) + + def visitConstructor(self, cons, name): + key = str(cons.name) + conflict = self.cons.get(key) + if conflict is None: + self.cons[key] = name + else: + print "Redefinition of constructor %s" % key + print "Defined in %s and %s" % (conflict, name) + self.errors += 1 + for f in cons.fields: + self.visit(f, key) + + def visitField(self, field, name): + key = str(field.type) + l = self.types.setdefault(key, []) + l.append(name) + + def visitProduct(self, prod, name): + for f in prod.fields: + self.visit(f, name) + +def check(mod): + v = Check() + v.visit(mod) + + for t in v.types: + if not mod.types.has_key(t) and not t in builtin_types: + 
v.errors += 1 + uses = ", ".join(v.types[t]) + print "Undefined type %s, used in %s" % (t, uses) + + return not v.errors + +def parse(file): + scanner = ASDLScanner() + parser = ASDLParser() + + buf = open(file).read() + tokens = scanner.tokenize(buf) + try: + return parser.parse(tokens) + except ASDLSyntaxError, err: + print err + lines = buf.split("\n") + print lines[err.lineno - 1] # lines starts at 0, files at 1 + +if __name__ == "__main__": + import glob + import sys + + if len(sys.argv) > 1: + files = sys.argv[1:] + else: + testdir = "tests" + files = glob.glob(testdir + "/*.asdl") + + for file in files: + print file + mod = parse(file) + print "module", mod.name + print len(mod.dfns), "definitions" + if not check(mod): + print "Check failed" + else: + for dfn in mod.dfns: + print dfn.type diff --git a/sys/src/cmd/python/Parser/asdl_c.py b/sys/src/cmd/python/Parser/asdl_c.py new file mode 100755 index 000000000..2974d7b15 --- /dev/null +++ b/sys/src/cmd/python/Parser/asdl_c.py @@ -0,0 +1,782 @@ +#! /usr/bin/env python +"""Generate C code from an ASDL description.""" + +# TO DO +# handle fields that have a type but no name + +import os, sys, traceback + +import asdl + +TABSIZE = 8 +MAX_COL = 80 + +def get_c_type(name): + """Return a string for the C name of the type. + + This function special cases the default types provided by asdl: + identifier, string, int, bool. + """ + # XXX ack! need to figure out where Id is useful and where string + if isinstance(name, asdl.Id): + name = name.value + if name in asdl.builtin_types: + return name + else: + return "%s_ty" % name + +def reflow_lines(s, depth): + """Reflow the line s indented depth tabs. + + Return a sequence of lines where no line extends beyond MAX_COL + when properly indented. The first line is properly indented based + exclusively on depth * TABSIZE. All following lines -- these are + the reflowed lines generated by this function -- start at the same + column as the first character beyond the opening { in the first + line. + """ + size = MAX_COL - depth * TABSIZE + if len(s) < size: + return [s] + + lines = [] + cur = s + padding = "" + while len(cur) > size: + i = cur.rfind(' ', 0, size) + # XXX this should be fixed for real + if i == -1 and 'GeneratorExp' in cur: + i = size + 3 + assert i != -1, "Impossible line %d to reflow: %s" % (size, `s`) + lines.append(padding + cur[:i]) + if len(lines) == 1: + # find new size based on brace + j = cur.find('{', 0, i) + if j >= 0: + j += 2 # account for the brace and the space after it + size -= j + padding = " " * j + else: + j = cur.find('(', 0, i) + if j >= 0: + j += 1 # account for the paren (no space after it) + size -= j + padding = " " * j + cur = cur[i+1:] + else: + lines.append(padding + cur) + return lines + +def is_simple(sum): + """Return True if a sum is a simple. + + A sum is simple if its types have no fields, e.g. + unaryop = Invert | Not | UAdd | USub + """ + + for t in sum.types: + if t.fields: + return False + return True + +class EmitVisitor(asdl.VisitorBase): + """Visit that emits lines""" + + def __init__(self, file): + self.file = file + super(EmitVisitor, self).__init__() + + def emit(self, s, depth, reflow=1): + # XXX reflow long lines? 
+ if reflow: + lines = reflow_lines(s, depth) + else: + lines = [s] + for line in lines: + line = (" " * TABSIZE * depth) + line + "\n" + self.file.write(line) + +class TypeDefVisitor(EmitVisitor): + def visitModule(self, mod): + for dfn in mod.dfns: + self.visit(dfn) + + def visitType(self, type, depth=0): + self.visit(type.value, type.name, depth) + + def visitSum(self, sum, name, depth): + if is_simple(sum): + self.simple_sum(sum, name, depth) + else: + self.sum_with_constructors(sum, name, depth) + + def simple_sum(self, sum, name, depth): + enum = [] + for i in range(len(sum.types)): + type = sum.types[i] + enum.append("%s=%d" % (type.name, i + 1)) + enums = ", ".join(enum) + ctype = get_c_type(name) + s = "typedef enum _%s { %s } %s;" % (name, enums, ctype) + self.emit(s, depth) + self.emit("", depth) + + def sum_with_constructors(self, sum, name, depth): + ctype = get_c_type(name) + s = "typedef struct _%(name)s *%(ctype)s;" % locals() + self.emit(s, depth) + self.emit("", depth) + + def visitProduct(self, product, name, depth): + ctype = get_c_type(name) + s = "typedef struct _%(name)s *%(ctype)s;" % locals() + self.emit(s, depth) + self.emit("", depth) + +class StructVisitor(EmitVisitor): + """Visitor to generate typdefs for AST.""" + + def visitModule(self, mod): + for dfn in mod.dfns: + self.visit(dfn) + + def visitType(self, type, depth=0): + self.visit(type.value, type.name, depth) + + def visitSum(self, sum, name, depth): + if not is_simple(sum): + self.sum_with_constructors(sum, name, depth) + + def sum_with_constructors(self, sum, name, depth): + def emit(s, depth=depth): + self.emit(s % sys._getframe(1).f_locals, depth) + enum = [] + for i in range(len(sum.types)): + type = sum.types[i] + enum.append("%s_kind=%d" % (type.name, i + 1)) + + emit("enum _%(name)s_kind {" + ", ".join(enum) + "};") + + emit("struct _%(name)s {") + emit("enum _%(name)s_kind kind;", depth + 1) + emit("union {", depth + 1) + for t in sum.types: + self.visit(t, depth + 2) + emit("} v;", depth + 1) + for field in sum.attributes: + # rudimentary attribute handling + type = str(field.type) + assert type in asdl.builtin_types, type + emit("%s %s;" % (type, field.name), depth + 1); + emit("};") + emit("") + + def visitConstructor(self, cons, depth): + if cons.fields: + self.emit("struct {", depth) + for f in cons.fields: + self.visit(f, depth + 1) + self.emit("} %s;" % cons.name, depth) + self.emit("", depth) + else: + # XXX not sure what I want here, nothing is probably fine + pass + + def visitField(self, field, depth): + # XXX need to lookup field.type, because it might be something + # like a builtin... 
+ ctype = get_c_type(field.type) + name = field.name + if field.seq: + if field.type.value in ('cmpop',): + self.emit("asdl_int_seq *%(name)s;" % locals(), depth) + else: + self.emit("asdl_seq *%(name)s;" % locals(), depth) + else: + self.emit("%(ctype)s %(name)s;" % locals(), depth) + + def visitProduct(self, product, name, depth): + self.emit("struct _%(name)s {" % locals(), depth) + for f in product.fields: + self.visit(f, depth + 1) + self.emit("};", depth) + self.emit("", depth) + +class PrototypeVisitor(EmitVisitor): + """Generate function prototypes for the .h file""" + + def visitModule(self, mod): + for dfn in mod.dfns: + self.visit(dfn) + + def visitType(self, type): + self.visit(type.value, type.name) + + def visitSum(self, sum, name): + if is_simple(sum): + pass # XXX + else: + for t in sum.types: + self.visit(t, name, sum.attributes) + + def get_args(self, fields): + """Return list of C argument into, one for each field. + + Argument info is 3-tuple of a C type, variable name, and flag + that is true if type can be NULL. + """ + args = [] + unnamed = {} + for f in fields: + if f.name is None: + name = f.type + c = unnamed[name] = unnamed.get(name, 0) + 1 + if c > 1: + name = "name%d" % (c - 1) + else: + name = f.name + # XXX should extend get_c_type() to handle this + if f.seq: + if f.type.value in ('cmpop',): + ctype = "asdl_int_seq *" + else: + ctype = "asdl_seq *" + else: + ctype = get_c_type(f.type) + args.append((ctype, name, f.opt or f.seq)) + return args + + def visitConstructor(self, cons, type, attrs): + args = self.get_args(cons.fields) + attrs = self.get_args(attrs) + ctype = get_c_type(type) + self.emit_function(cons.name, ctype, args, attrs) + + def emit_function(self, name, ctype, args, attrs, union=1): + args = args + attrs + if args: + argstr = ", ".join(["%s %s" % (atype, aname) + for atype, aname, opt in args]) + argstr += ", PyArena *arena" + else: + argstr = "PyArena *arena" + margs = "a0" + for i in range(1, len(args)+1): + margs += ", a%d" % i + self.emit("#define %s(%s) _Py_%s(%s)" % (name, margs, name, margs), 0, + reflow = 0) + self.emit("%s _Py_%s(%s);" % (ctype, name, argstr), 0) + + def visitProduct(self, prod, name): + self.emit_function(name, get_c_type(name), + self.get_args(prod.fields), [], union=0) + +class FunctionVisitor(PrototypeVisitor): + """Visitor to generate constructor functions for AST.""" + + def emit_function(self, name, ctype, args, attrs, union=1): + def emit(s, depth=0, reflow=1): + self.emit(s, depth, reflow) + argstr = ", ".join(["%s %s" % (atype, aname) + for atype, aname, opt in args + attrs]) + if argstr: + argstr += ", PyArena *arena" + else: + argstr = "PyArena *arena" + self.emit("%s" % ctype, 0) + emit("%s(%s)" % (name, argstr)) + emit("{") + emit("%s p;" % ctype, 1) + for argtype, argname, opt in args: + # XXX hack alert: false is allowed for a bool + if not opt and not (argtype == "bool" or argtype == "int"): + emit("if (!%s) {" % argname, 1) + emit("PyErr_SetString(PyExc_ValueError,", 2) + msg = "field %s is required for %s" % (argname, name) + emit(' "%s");' % msg, + 2, reflow=0) + emit('return NULL;', 2) + emit('}', 1) + + emit("p = (%s)PyArena_Malloc(arena, sizeof(*p));" % ctype, 1); + emit("if (!p) {", 1) + emit("PyErr_NoMemory();", 2) + emit("return NULL;", 2) + emit("}", 1) + if union: + self.emit_body_union(name, args, attrs) + else: + self.emit_body_struct(name, args, attrs) + emit("return p;", 1) + emit("}") + emit("") + + def emit_body_union(self, name, args, attrs): + def emit(s, depth=0, reflow=1): + 
self.emit(s, depth, reflow) + emit("p->kind = %s_kind;" % name, 1) + for argtype, argname, opt in args: + emit("p->v.%s.%s = %s;" % (name, argname, argname), 1) + for argtype, argname, opt in attrs: + emit("p->%s = %s;" % (argname, argname), 1) + + def emit_body_struct(self, name, args, attrs): + def emit(s, depth=0, reflow=1): + self.emit(s, depth, reflow) + for argtype, argname, opt in args: + emit("p->%s = %s;" % (argname, argname), 1) + assert not attrs + +class PickleVisitor(EmitVisitor): + + def visitModule(self, mod): + for dfn in mod.dfns: + self.visit(dfn) + + def visitType(self, type): + self.visit(type.value, type.name) + + def visitSum(self, sum, name): + pass + + def visitProduct(self, sum, name): + pass + + def visitConstructor(self, cons, name): + pass + + def visitField(self, sum): + pass + +class MarshalPrototypeVisitor(PickleVisitor): + + def prototype(self, sum, name): + ctype = get_c_type(name) + self.emit("static int marshal_write_%s(PyObject **, int *, %s);" + % (name, ctype), 0) + + visitProduct = visitSum = prototype + +class PyTypesDeclareVisitor(PickleVisitor): + + def visitProduct(self, prod, name): + self.emit("static PyTypeObject *%s_type;" % name, 0) + self.emit("static PyObject* ast2obj_%s(void*);" % name, 0) + if prod.fields: + self.emit("static char *%s_fields[]={" % name,0) + for f in prod.fields: + self.emit('"%s",' % f.name, 1) + self.emit("};", 0) + + def visitSum(self, sum, name): + self.emit("static PyTypeObject *%s_type;" % name, 0) + if sum.attributes: + self.emit("static char *%s_attributes[] = {" % name, 0) + for a in sum.attributes: + self.emit('"%s",' % a.name, 1) + self.emit("};", 0) + ptype = "void*" + if is_simple(sum): + ptype = get_c_type(name) + tnames = [] + for t in sum.types: + tnames.append(str(t.name)+"_singleton") + tnames = ", *".join(tnames) + self.emit("static PyObject *%s;" % tnames, 0) + self.emit("static PyObject* ast2obj_%s(%s);" % (name, ptype), 0) + for t in sum.types: + self.visitConstructor(t, name) + + def visitConstructor(self, cons, name): + self.emit("static PyTypeObject *%s_type;" % cons.name, 0) + if cons.fields: + self.emit("static char *%s_fields[]={" % cons.name, 0) + for t in cons.fields: + self.emit('"%s",' % t.name, 1) + self.emit("};",0) + +class PyTypesVisitor(PickleVisitor): + + def visitModule(self, mod): + self.emit(""" +static PyTypeObject* make_type(char *type, PyTypeObject* base, char**fields, int num_fields) +{ + PyObject *fnames, *result; + int i; + if (num_fields) { + fnames = PyTuple_New(num_fields); + if (!fnames) return NULL; + } else { + fnames = Py_None; + Py_INCREF(Py_None); + } + for(i=0; i < num_fields; i++) { + PyObject *field = PyString_FromString(fields[i]); + if (!field) { + Py_DECREF(fnames); + return NULL; + } + PyTuple_SET_ITEM(fnames, i, field); + } + result = PyObject_CallFunction((PyObject*)&PyType_Type, "s(O){sOss}", + type, base, "_fields", fnames, "__module__", "_ast"); + Py_DECREF(fnames); + return (PyTypeObject*)result; +} + +static int add_attributes(PyTypeObject* type, char**attrs, int num_fields) +{ + int i, result; + PyObject *s, *l = PyList_New(num_fields); + if (!l) return 0; + for(i = 0; i < num_fields; i++) { + s = PyString_FromString(attrs[i]); + if (!s) { + Py_DECREF(l); + return 0; + } + PyList_SET_ITEM(l, i, s); + } + result = PyObject_SetAttrString((PyObject*)type, "_attributes", l) >= 0; + Py_DECREF(l); + return result; +} + +static PyObject* ast2obj_list(asdl_seq *seq, PyObject* (*func)(void*)) +{ + int i, n = asdl_seq_LEN(seq); + PyObject *result = 
PyList_New(n); + PyObject *value; + if (!result) + return NULL; + for (i = 0; i < n; i++) { + value = func(asdl_seq_GET(seq, i)); + if (!value) { + Py_DECREF(result); + return NULL; + } + PyList_SET_ITEM(result, i, value); + } + return result; +} + +static PyObject* ast2obj_object(void *o) +{ + if (!o) + o = Py_None; + Py_INCREF((PyObject*)o); + return (PyObject*)o; +} +#define ast2obj_identifier ast2obj_object +#define ast2obj_string ast2obj_object +static PyObject* ast2obj_bool(bool b) +{ + return PyBool_FromLong(b); +} + +static PyObject* ast2obj_int(bool b) +{ + return PyInt_FromLong(b); +} +""", 0, reflow=False) + + self.emit("static int init_types(void)",0) + self.emit("{", 0) + self.emit("static int initialized;", 1) + self.emit("if (initialized) return 1;", 1) + self.emit('AST_type = make_type("AST", &PyBaseObject_Type, NULL, 0);', 1) + for dfn in mod.dfns: + self.visit(dfn) + self.emit("initialized = 1;", 1) + self.emit("return 1;", 1); + self.emit("}", 0) + + def visitProduct(self, prod, name): + if prod.fields: + fields = name.value+"_fields" + else: + fields = "NULL" + self.emit('%s_type = make_type("%s", AST_type, %s, %d);' % + (name, name, fields, len(prod.fields)), 1) + self.emit("if (!%s_type) return 0;" % name, 1) + + def visitSum(self, sum, name): + self.emit('%s_type = make_type("%s", AST_type, NULL, 0);' % (name, name), 1) + self.emit("if (!%s_type) return 0;" % name, 1) + if sum.attributes: + self.emit("if (!add_attributes(%s_type, %s_attributes, %d)) return 0;" % + (name, name, len(sum.attributes)), 1) + else: + self.emit("if (!add_attributes(%s_type, NULL, 0)) return 0;" % name, 1) + simple = is_simple(sum) + for t in sum.types: + self.visitConstructor(t, name, simple) + + def visitConstructor(self, cons, name, simple): + if cons.fields: + fields = cons.name.value+"_fields" + else: + fields = "NULL" + self.emit('%s_type = make_type("%s", %s_type, %s, %d);' % + (cons.name, cons.name, name, fields, len(cons.fields)), 1) + self.emit("if (!%s_type) return 0;" % cons.name, 1) + if simple: + self.emit("%s_singleton = PyType_GenericNew(%s_type, NULL, NULL);" % + (cons.name, cons.name), 1) + self.emit("if (!%s_singleton) return 0;" % cons.name, 1) + +class ASTModuleVisitor(PickleVisitor): + + def visitModule(self, mod): + self.emit("PyMODINIT_FUNC", 0) + self.emit("init_ast(void)", 0) + self.emit("{", 0) + self.emit("PyObject *m, *d;", 1) + self.emit("if (!init_types()) return;", 1) + self.emit('m = Py_InitModule3("_ast", NULL, NULL);', 1) + self.emit("if (!m) return;", 1) + self.emit("d = PyModule_GetDict(m);", 1) + self.emit('if (PyDict_SetItemString(d, "AST", (PyObject*)AST_type) < 0) return;', 1) + self.emit('if (PyModule_AddIntConstant(m, "PyCF_ONLY_AST", PyCF_ONLY_AST) < 0)', 1) + self.emit("return;", 2) + # Value of version: "$Revision: 53490 $" + self.emit('if (PyModule_AddStringConstant(m, "__version__", "%s") < 0)' % mod.version.value[12:-3], 1) + self.emit("return;", 2) + for dfn in mod.dfns: + self.visit(dfn) + self.emit("}", 0) + + def visitProduct(self, prod, name): + self.addObj(name) + + def visitSum(self, sum, name): + self.addObj(name) + for t in sum.types: + self.visitConstructor(t, name) + + def visitConstructor(self, cons, name): + self.addObj(cons.name) + + def addObj(self, name): + self.emit('if (PyDict_SetItemString(d, "%s", (PyObject*)%s_type) < 0) return;' % (name, name), 1) + +_SPECIALIZED_SEQUENCES = ('stmt', 'expr') + +def find_sequence(fields, doing_specialization): + """Return True if any field uses a sequence.""" + for f in fields: + if f.seq: 
+ if not doing_specialization: + return True + if str(f.type) not in _SPECIALIZED_SEQUENCES: + return True + return False + +def has_sequence(types, doing_specialization): + for t in types: + if find_sequence(t.fields, doing_specialization): + return True + return False + + +class StaticVisitor(PickleVisitor): + CODE = '''Very simple, always emit this static code. Overide CODE''' + + def visit(self, object): + self.emit(self.CODE, 0, reflow=False) + +class ObjVisitor(PickleVisitor): + + def func_begin(self, name): + ctype = get_c_type(name) + self.emit("PyObject*", 0) + self.emit("ast2obj_%s(void* _o)" % (name), 0) + self.emit("{", 0) + self.emit("%s o = (%s)_o;" % (ctype, ctype), 1) + self.emit("PyObject *result = NULL, *value = NULL;", 1) + self.emit('if (!o) {', 1) + self.emit("Py_INCREF(Py_None);", 2) + self.emit('return Py_None;', 2) + self.emit("}", 1) + self.emit('', 0) + + def func_end(self): + self.emit("return result;", 1) + self.emit("failed:", 0) + self.emit("Py_XDECREF(value);", 1) + self.emit("Py_XDECREF(result);", 1) + self.emit("return NULL;", 1) + self.emit("}", 0) + self.emit("", 0) + + def visitSum(self, sum, name): + if is_simple(sum): + self.simpleSum(sum, name) + return + self.func_begin(name) + self.emit("switch (o->kind) {", 1) + for i in range(len(sum.types)): + t = sum.types[i] + self.visitConstructor(t, i + 1, name) + self.emit("}", 1) + for a in sum.attributes: + self.emit("value = ast2obj_%s(o->%s);" % (a.type, a.name), 1) + self.emit("if (!value) goto failed;", 1) + self.emit('if (PyObject_SetAttrString(result, "%s", value) < 0)' % a.name, 1) + self.emit('goto failed;', 2) + self.emit('Py_DECREF(value);', 1) + self.func_end() + + def simpleSum(self, sum, name): + self.emit("PyObject* ast2obj_%s(%s_ty o)" % (name, name), 0) + self.emit("{", 0) + self.emit("switch(o) {", 1) + for t in sum.types: + self.emit("case %s:" % t.name, 2) + self.emit("Py_INCREF(%s_singleton);" % t.name, 3) + self.emit("return %s_singleton;" % t.name, 3) + self.emit("}", 1) + self.emit("return NULL; /* cannot happen */", 1) + self.emit("}", 0) + + def visitProduct(self, prod, name): + self.func_begin(name) + self.emit("result = PyType_GenericNew(%s_type, NULL, NULL);" % name, 1); + self.emit("if (!result) return NULL;", 1) + for field in prod.fields: + self.visitField(field, name, 1, True) + self.func_end() + + def visitConstructor(self, cons, enum, name): + self.emit("case %s_kind:" % cons.name, 1) + self.emit("result = PyType_GenericNew(%s_type, NULL, NULL);" % cons.name, 2); + self.emit("if (!result) goto failed;", 2) + for f in cons.fields: + self.visitField(f, cons.name, 2, False) + self.emit("break;", 2) + + def visitField(self, field, name, depth, product): + def emit(s, d): + self.emit(s, depth + d) + if product: + value = "o->%s" % field.name + else: + value = "o->v.%s.%s" % (name, field.name) + self.set(field, value, depth) + emit("if (!value) goto failed;", 0) + emit('if (PyObject_SetAttrString(result, "%s", value) == -1)' % field.name, 0) + emit("goto failed;", 1) + emit("Py_DECREF(value);", 0) + + def emitSeq(self, field, value, depth, emit): + emit("seq = %s;" % value, 0) + emit("n = asdl_seq_LEN(seq);", 0) + emit("value = PyList_New(n);", 0) + emit("if (!value) goto failed;", 0) + emit("for (i = 0; i < n; i++) {", 0) + self.set("value", field, "asdl_seq_GET(seq, i)", depth + 1) + emit("if (!value1) goto failed;", 1) + emit("PyList_SET_ITEM(value, i, value1);", 1) + emit("value1 = NULL;", 1) + emit("}", 0) + + def set(self, field, value, depth): + if field.seq: + # XXX 
should really check for is_simple, but that requires a symbol table + if field.type.value == "cmpop": + # While the sequence elements are stored as void*, + # ast2obj_cmpop expects an enum + self.emit("{", depth) + self.emit("int i, n = asdl_seq_LEN(%s);" % value, depth+1) + self.emit("value = PyList_New(n);", depth+1) + self.emit("if (!value) goto failed;", depth+1) + self.emit("for(i = 0; i < n; i++)", depth+1) + # This cannot fail, so no need for error handling + self.emit("PyList_SET_ITEM(value, i, ast2obj_cmpop((cmpop_ty)asdl_seq_GET(%s, i)));" % value, + depth+2, reflow=False) + self.emit("}", depth) + else: + self.emit("value = ast2obj_list(%s, ast2obj_%s);" % (value, field.type), depth) + else: + ctype = get_c_type(field.type) + self.emit("value = ast2obj_%s(%s);" % (field.type, value), depth, reflow=False) + + +class PartingShots(StaticVisitor): + + CODE = """ +PyObject* PyAST_mod2obj(mod_ty t) +{ + init_types(); + return ast2obj_mod(t); +} +""" + +class ChainOfVisitors: + def __init__(self, *visitors): + self.visitors = visitors + + def visit(self, object): + for v in self.visitors: + v.visit(object) + v.emit("", 0) + +def main(srcfile): + argv0 = sys.argv[0] + components = argv0.split(os.sep) + argv0 = os.sep.join(components[-2:]) + auto_gen_msg = '/* File automatically generated by %s */\n' % argv0 + mod = asdl.parse(srcfile) + if not asdl.check(mod): + sys.exit(1) + if INC_DIR: + p = "%s/%s-ast.h" % (INC_DIR, mod.name) + f = open(p, "wb") + print >> f, auto_gen_msg + print >> f, '#include "asdl.h"\n' + c = ChainOfVisitors(TypeDefVisitor(f), + StructVisitor(f), + PrototypeVisitor(f), + ) + c.visit(mod) + print >>f, "PyObject* PyAST_mod2obj(mod_ty t);" + f.close() + + if SRC_DIR: + p = os.path.join(SRC_DIR, str(mod.name) + "-ast.c") + f = open(p, "wb") + print >> f, auto_gen_msg + print >> f, '#include "Python.h"' + print >> f, '#include "%s-ast.h"' % mod.name + print >> f + print >>f, "static PyTypeObject* AST_type;" + v = ChainOfVisitors( + PyTypesDeclareVisitor(f), + PyTypesVisitor(f), + FunctionVisitor(f), + ObjVisitor(f), + ASTModuleVisitor(f), + PartingShots(f), + ) + v.visit(mod) + f.close() + +if __name__ == "__main__": + import sys + import getopt + + INC_DIR = '' + SRC_DIR = '' + opts, args = getopt.getopt(sys.argv[1:], "h:c:") + if len(opts) != 1: + print "Must specify exactly one output file" + sys.exit(1) + for o, v in opts: + if o == '-h': + INC_DIR = v + if o == '-c': + SRC_DIR = v + if len(args) != 1: + print "Must specify single input file" + sys.exit(1) + main(args[0]) diff --git a/sys/src/cmd/python/Parser/bitset.c b/sys/src/cmd/python/Parser/bitset.c new file mode 100644 index 000000000..b5543b81d --- /dev/null +++ b/sys/src/cmd/python/Parser/bitset.c @@ -0,0 +1,66 @@ + +/* Bitset primitives used by the parser generator */ + +#include "pgenheaders.h" +#include "bitset.h" + +bitset +newbitset(int nbits) +{ + int nbytes = NBYTES(nbits); + bitset ss = (char *)PyObject_MALLOC(sizeof(BYTE) * nbytes); + + if (ss == NULL) + Py_FatalError("no mem for bitset"); + + ss += nbytes; + while (--nbytes >= 0) + *--ss = 0; + return ss; +} + +void +delbitset(bitset ss) +{ + PyObject_FREE(ss); +} + +int +addbit(bitset ss, int ibit) +{ + int ibyte = BIT2BYTE(ibit); + BYTE mask = BIT2MASK(ibit); + + if (ss[ibyte] & mask) + return 0; /* Bit already set */ + ss[ibyte] |= mask; + return 1; +} + +#if 0 /* Now a macro */ +int +testbit(bitset ss, int ibit) +{ + return (ss[BIT2BYTE(ibit)] & BIT2MASK(ibit)) != 0; +} +#endif + +int +samebitset(bitset ss1, bitset ss2, int nbits) +{ + int 
i; + + for (i = NBYTES(nbits); --i >= 0; ) + if (*ss1++ != *ss2++) + return 0; + return 1; +} + +void +mergebitset(bitset ss1, bitset ss2, int nbits) +{ + int i; + + for (i = NBYTES(nbits); --i >= 0; ) + *ss1++ |= *ss2++; +} diff --git a/sys/src/cmd/python/Parser/firstsets.c b/sys/src/cmd/python/Parser/firstsets.c new file mode 100644 index 000000000..00467b31a --- /dev/null +++ b/sys/src/cmd/python/Parser/firstsets.c @@ -0,0 +1,113 @@ + +/* Computation of FIRST stets */ + +#include "pgenheaders.h" +#include "grammar.h" +#include "token.h" + +extern int Py_DebugFlag; + +/* Forward */ +static void calcfirstset(grammar *, dfa *); + +void +addfirstsets(grammar *g) +{ + int i; + dfa *d; + + if (Py_DebugFlag) + printf("Adding FIRST sets ...\n"); + for (i = 0; i < g->g_ndfas; i++) { + d = &g->g_dfa[i]; + if (d->d_first == NULL) + calcfirstset(g, d); + } +} + +static void +calcfirstset(grammar *g, dfa *d) +{ + int i, j; + state *s; + arc *a; + int nsyms; + int *sym; + int nbits; + static bitset dummy; + bitset result; + int type; + dfa *d1; + label *l0; + + if (Py_DebugFlag) + printf("Calculate FIRST set for '%s'\n", d->d_name); + + if (dummy == NULL) + dummy = newbitset(1); + if (d->d_first == dummy) { + fprintf(stderr, "Left-recursion for '%s'\n", d->d_name); + return; + } + if (d->d_first != NULL) { + fprintf(stderr, "Re-calculating FIRST set for '%s' ???\n", + d->d_name); + } + d->d_first = dummy; + + l0 = g->g_ll.ll_label; + nbits = g->g_ll.ll_nlabels; + result = newbitset(nbits); + + sym = (int *)PyObject_MALLOC(sizeof(int)); + if (sym == NULL) + Py_FatalError("no mem for new sym in calcfirstset"); + nsyms = 1; + sym[0] = findlabel(&g->g_ll, d->d_type, (char *)NULL); + + s = &d->d_state[d->d_initial]; + for (i = 0; i < s->s_narcs; i++) { + a = &s->s_arc[i]; + for (j = 0; j < nsyms; j++) { + if (sym[j] == a->a_lbl) + break; + } + if (j >= nsyms) { /* New label */ + sym = (int *)PyObject_REALLOC(sym, + sizeof(int) * (nsyms + 1)); + if (sym == NULL) + Py_FatalError( + "no mem to resize sym in calcfirstset"); + sym[nsyms++] = a->a_lbl; + type = l0[a->a_lbl].lb_type; + if (ISNONTERMINAL(type)) { + d1 = PyGrammar_FindDFA(g, type); + if (d1->d_first == dummy) { + fprintf(stderr, + "Left-recursion below '%s'\n", + d->d_name); + } + else { + if (d1->d_first == NULL) + calcfirstset(g, d1); + mergebitset(result, + d1->d_first, nbits); + } + } + else if (ISTERMINAL(type)) { + addbit(result, a->a_lbl); + } + } + } + d->d_first = result; + if (Py_DebugFlag) { + printf("FIRST set for '%s': {", d->d_name); + for (i = 0; i < nbits; i++) { + if (testbit(result, i)) + printf(" %s", PyGrammar_LabelRepr(&l0[i])); + } + printf(" }\n"); + } + + PyObject_FREE(sym); +} diff --git a/sys/src/cmd/python/Parser/grammar.c b/sys/src/cmd/python/Parser/grammar.c new file mode 100644 index 000000000..9e7c49aab --- /dev/null +++ b/sys/src/cmd/python/Parser/grammar.c @@ -0,0 +1,254 @@ + +/* Grammar implementation */ + +#include "Python.h" +#include "pgenheaders.h" + +#include <ctype.h> + +#include "token.h" +#include "grammar.h" + +#ifdef RISCOS +#include <unixlib.h> +#endif + +extern int Py_DebugFlag; + +grammar * +newgrammar(int start) +{ + grammar *g; + + g = (grammar *)PyObject_MALLOC(sizeof(grammar)); + if (g == NULL) + Py_FatalError("no mem for new grammar"); + g->g_ndfas = 0; + g->g_dfa = NULL; + g->g_start = start; + g->g_ll.ll_nlabels = 0; + g->g_ll.ll_label = NULL; + g->g_accel = 0; + return g; +} + +dfa * +adddfa(grammar *g, int type, char *name) +{ + dfa *d; + + g->g_dfa = (dfa *)PyObject_REALLOC(g->g_dfa, + 
sizeof(dfa) * (g->g_ndfas + 1)); + if (g->g_dfa == NULL) + Py_FatalError("no mem to resize dfa in adddfa"); + d = &g->g_dfa[g->g_ndfas++]; + d->d_type = type; + d->d_name = strdup(name); + d->d_nstates = 0; + d->d_state = NULL; + d->d_initial = -1; + d->d_first = NULL; + return d; /* Only use while fresh! */ +} + +int +addstate(dfa *d) +{ + state *s; + + d->d_state = (state *)PyObject_REALLOC(d->d_state, + sizeof(state) * (d->d_nstates + 1)); + if (d->d_state == NULL) + Py_FatalError("no mem to resize state in addstate"); + s = &d->d_state[d->d_nstates++]; + s->s_narcs = 0; + s->s_arc = NULL; + s->s_lower = 0; + s->s_upper = 0; + s->s_accel = NULL; + s->s_accept = 0; + return s - d->d_state; +} + +void +addarc(dfa *d, int from, int to, int lbl) +{ + state *s; + arc *a; + + assert(0 <= from && from < d->d_nstates); + assert(0 <= to && to < d->d_nstates); + + s = &d->d_state[from]; + s->s_arc = (arc *)PyObject_REALLOC(s->s_arc, sizeof(arc) * (s->s_narcs + 1)); + if (s->s_arc == NULL) + Py_FatalError("no mem to resize arc list in addarc"); + a = &s->s_arc[s->s_narcs++]; + a->a_lbl = lbl; + a->a_arrow = to; +} + +int +addlabel(labellist *ll, int type, char *str) +{ + int i; + label *lb; + + for (i = 0; i < ll->ll_nlabels; i++) { + if (ll->ll_label[i].lb_type == type && + strcmp(ll->ll_label[i].lb_str, str) == 0) + return i; + } + ll->ll_label = (label *)PyObject_REALLOC(ll->ll_label, + sizeof(label) * (ll->ll_nlabels + 1)); + if (ll->ll_label == NULL) + Py_FatalError("no mem to resize labellist in addlabel"); + lb = &ll->ll_label[ll->ll_nlabels++]; + lb->lb_type = type; + lb->lb_str = strdup(str); + if (Py_DebugFlag) + printf("Label @ %8p, %d: %s\n", ll, ll->ll_nlabels, + PyGrammar_LabelRepr(lb)); + return lb - ll->ll_label; +} + +/* Same, but rather dies than adds */ + +int +findlabel(labellist *ll, int type, char *str) +{ + int i; + + for (i = 0; i < ll->ll_nlabels; i++) { + if (ll->ll_label[i].lb_type == type /*&& + strcmp(ll->ll_label[i].lb_str, str) == 0*/) + return i; + } + fprintf(stderr, "Label %d/'%s' not found\n", type, str); + Py_FatalError("grammar.c:findlabel()"); + return 0; /* Make gcc -Wall happy */ +} + +/* Forward */ +static void translabel(grammar *, label *); + +void +translatelabels(grammar *g) +{ + int i; + +#ifdef Py_DEBUG + printf("Translating labels ...\n"); +#endif + /* Don't translate EMPTY */ + for (i = EMPTY+1; i < g->g_ll.ll_nlabels; i++) + translabel(g, &g->g_ll.ll_label[i]); +} + +static void +translabel(grammar *g, label *lb) +{ + int i; + + if (Py_DebugFlag) + printf("Translating label %s ...\n", PyGrammar_LabelRepr(lb)); + + if (lb->lb_type == NAME) { + for (i = 0; i < g->g_ndfas; i++) { + if (strcmp(lb->lb_str, g->g_dfa[i].d_name) == 0) { + if (Py_DebugFlag) + printf( + "Label %s is non-terminal %d.\n", + lb->lb_str, + g->g_dfa[i].d_type); + lb->lb_type = g->g_dfa[i].d_type; + free(lb->lb_str); + lb->lb_str = NULL; + return; + } + } + for (i = 0; i < (int)N_TOKENS; i++) { + if (strcmp(lb->lb_str, _PyParser_TokenNames[i]) == 0) { + if (Py_DebugFlag) + printf("Label %s is terminal %d.\n", + lb->lb_str, i); + lb->lb_type = i; + free(lb->lb_str); + lb->lb_str = NULL; + return; + } + } + printf("Can't translate NAME label '%s'\n", lb->lb_str); + return; + } + + if (lb->lb_type == STRING) { + if (isalpha(Py_CHARMASK(lb->lb_str[1])) || + lb->lb_str[1] == '_') { + char *p; + char *src; + char *dest; + size_t name_len; + if (Py_DebugFlag) + printf("Label %s is a keyword\n", lb->lb_str); + lb->lb_type = NAME; + src = lb->lb_str + 1; + p = strchr(src, '\''); + if (p) + 
name_len = p - src; + else + name_len = strlen(src); + dest = (char *)malloc(name_len + 1); + if (!dest) { + printf("Can't alloc dest '%s'\n", src); + return; + } + strncpy(dest, src, name_len); + dest[name_len] = '\0'; + free(lb->lb_str); + lb->lb_str = dest; + } + else if (lb->lb_str[2] == lb->lb_str[0]) { + int type = (int) PyToken_OneChar(lb->lb_str[1]); + if (type != OP) { + lb->lb_type = type; + free(lb->lb_str); + lb->lb_str = NULL; + } + else + printf("Unknown OP label %s\n", + lb->lb_str); + } + else if (lb->lb_str[2] && lb->lb_str[3] == lb->lb_str[0]) { + int type = (int) PyToken_TwoChars(lb->lb_str[1], + lb->lb_str[2]); + if (type != OP) { + lb->lb_type = type; + free(lb->lb_str); + lb->lb_str = NULL; + } + else + printf("Unknown OP label %s\n", + lb->lb_str); + } + else if (lb->lb_str[2] && lb->lb_str[3] && lb->lb_str[4] == lb->lb_str[0]) { + int type = (int) PyToken_ThreeChars(lb->lb_str[1], + lb->lb_str[2], + lb->lb_str[3]); + if (type != OP) { + lb->lb_type = type; + free(lb->lb_str); + lb->lb_str = NULL; + } + else + printf("Unknown OP label %s\n", + lb->lb_str); + } + else + printf("Can't translate STRING label %s\n", + lb->lb_str); + } + else + printf("Can't translate label '%s'\n", + PyGrammar_LabelRepr(lb)); +} diff --git a/sys/src/cmd/python/Parser/grammar.mak b/sys/src/cmd/python/Parser/grammar.mak new file mode 100644 index 000000000..55f028ffb --- /dev/null +++ b/sys/src/cmd/python/Parser/grammar.mak @@ -0,0 +1,45 @@ +# This manages to rebuild graminit.{h, c} under MSVC 6 (Windows), via +# +# nmake /f grammar.mak +# +# You may also need to copy python23.dll into this directory, or get +# it on your search path. +# +# The intermediate files can be nuked afterwards: +# +# nmake /f grammar.mak clean +# +# I don't understand the maze of preprocessor #define's on Windows, and +# as a result this requires linking with python23.lib, so it's of no use +# for bootstrapping (the cause appears to be a useless-- in this +# particular case --pragma in PC\pyconfig.h, which demands that +# python23.lib get linked in). 
+ +LIBS= ..\PCbuild\python25.lib + +CFLAGS= /I ..\Include /I ..\PC /D MS_NO_COREDLL /D PGEN /MD + +GRAMMAR_H= ..\Include\graminit.h +GRAMMAR_C= ..\Python\graminit.c +GRAMMAR_INPUT= ..\Grammar\Grammar + +PGEN= pgen.exe + +POBJS= acceler.obj grammar1.obj listnode.obj node.obj parser.obj \ + parsetok.obj tokenizer.obj bitset.obj metagrammar.obj + +PARSER_OBJS= $(POBJS) myreadline.obj + +PGOBJS= firstsets.obj grammar.obj pgen.obj printgrammar.obj pgenmain.obj + +PGENOBJS= $(POBJS) $(PGOBJS) + +$(GRAMMAR_H) $(GRAMMAR_C): $(PGEN) $(GRAMMAR_INPUT) + $(PGEN) $(GRAMMAR_INPUT) $(GRAMMAR_H) $(GRAMMAR_C) + +$(PGEN): $(PGENOBJS) + $(CC) $(PGENOBJS) $(LIBS) /Fe$(PGEN) + +clean: + del *.obj + del $(PGEN) diff --git a/sys/src/cmd/python/Parser/grammar1.c b/sys/src/cmd/python/Parser/grammar1.c new file mode 100644 index 000000000..b76719a1d --- /dev/null +++ b/sys/src/cmd/python/Parser/grammar1.c @@ -0,0 +1,57 @@ + +/* Grammar subroutines needed by parser */ + +#include "Python.h" +#include "pgenheaders.h" +#include "grammar.h" +#include "token.h" + +/* Return the DFA for the given type */ + +dfa * +PyGrammar_FindDFA(grammar *g, register int type) +{ + register dfa *d; +#if 1 + /* Massive speed-up */ + d = &g->g_dfa[type - NT_OFFSET]; + assert(d->d_type == type); + return d; +#else + /* Old, slow version */ + register int i; + + for (i = g->g_ndfas, d = g->g_dfa; --i >= 0; d++) { + if (d->d_type == type) + return d; + } + assert(0); + /* NOTREACHED */ +#endif +} + +char * +PyGrammar_LabelRepr(label *lb) +{ + static char buf[100]; + + if (lb->lb_type == ENDMARKER) + return "EMPTY"; + else if (ISNONTERMINAL(lb->lb_type)) { + if (lb->lb_str == NULL) { + PyOS_snprintf(buf, sizeof(buf), "NT%d", lb->lb_type); + return buf; + } + else + return lb->lb_str; + } + else { + if (lb->lb_str == NULL) + return _PyParser_TokenNames[lb->lb_type]; + else { + PyOS_snprintf(buf, sizeof(buf), "%.32s(%.32s)", + _PyParser_TokenNames[lb->lb_type], lb->lb_str); + return buf; + } + } +} diff --git a/sys/src/cmd/python/Parser/intrcheck.c b/sys/src/cmd/python/Parser/intrcheck.c new file mode 100644 index 000000000..e0f3252db --- /dev/null +++ b/sys/src/cmd/python/Parser/intrcheck.c @@ -0,0 +1,176 @@ + +/* Check for interrupts */ + +#include "Python.h" + +#ifdef QUICKWIN + +#include <io.h> + +void +PyOS_InitInterrupts(void) +{ +} + +void +PyOS_FiniInterrupts(void) +{ +} + +int +PyOS_InterruptOccurred(void) +{ + _wyield(); +} + +#define OK + +#endif /* QUICKWIN */ + +#if defined(_M_IX86) && !defined(__QNX__) +#include <io.h> +#endif + +#if defined(MSDOS) && !defined(QUICKWIN) + +#ifdef __GNUC__ + +/* This is for DJGPP's GO32 extender. I don't know how to trap + * control-C (There's no API for ctrl-C, and I don't want to mess with + * the interrupt vectors.) However, this DOES catch control-break. 
+ * --Amrit + */ + +#include <go32.h> + +void +PyOS_InitInterrupts(void) +{ + _go32_want_ctrl_break(1 /* TRUE */); +} + +void +PyOS_FiniInterrupts(void) +{ +} + +int +PyOS_InterruptOccurred(void) +{ + return _go32_was_ctrl_break_hit(); +} + +#else /* !__GNUC__ */ + +/* This might work for MS-DOS (untested though): */ + +void +PyOS_InitInterrupts(void) +{ +} + +void +PyOS_FiniInterrupts(void) +{ +} + +int +PyOS_InterruptOccurred(void) +{ + int interrupted = 0; + while (kbhit()) { + if (getch() == '\003') + interrupted = 1; + } + return interrupted; +} + +#endif /* __GNUC__ */ + +#define OK + +#endif /* MSDOS && !QUICKWIN */ + + +#ifndef OK + +/* Default version -- for real operating systems and for Standard C */ + +#include <stdio.h> +#include <string.h> +#include <signal.h> + +static int interrupted; + +void +PyErr_SetInterrupt(void) +{ + interrupted = 1; +} + +extern int PyErr_CheckSignals(void); + +static int +checksignals_witharg(void * arg) +{ + return PyErr_CheckSignals(); +} + +static void +intcatcher(int sig) +{ + extern void Py_Exit(int); + static char message[] = +"python: to interrupt a truly hanging Python program, interrupt once more.\n"; + switch (interrupted++) { + case 0: + break; + case 1: +#ifdef RISCOS + fprintf(stderr, message); +#else + write(2, message, strlen(message)); +#endif + break; + case 2: + interrupted = 0; + Py_Exit(1); + break; + } + PyOS_setsig(SIGINT, intcatcher); + Py_AddPendingCall(checksignals_witharg, NULL); +} + +static void (*old_siginthandler)(int) = SIG_DFL; + +void +PyOS_InitInterrupts(void) +{ + if ((old_siginthandler = PyOS_setsig(SIGINT, SIG_IGN)) != SIG_IGN) + PyOS_setsig(SIGINT, intcatcher); +} + +void +PyOS_FiniInterrupts(void) +{ + PyOS_setsig(SIGINT, old_siginthandler); +} + +int +PyOS_InterruptOccurred(void) +{ + if (!interrupted) + return 0; + interrupted = 0; + return 1; +} + +#endif /* !OK */ + +void +PyOS_AfterFork(void) +{ +#ifdef WITH_THREAD + PyEval_ReInitThreads(); +#endif +} diff --git a/sys/src/cmd/python/Parser/listnode.c b/sys/src/cmd/python/Parser/listnode.c new file mode 100644 index 000000000..c0b3b666f --- /dev/null +++ b/sys/src/cmd/python/Parser/listnode.c @@ -0,0 +1,66 @@ + +/* List a node on a file */ + +#include "pgenheaders.h" +#include "token.h" +#include "node.h" + +/* Forward */ +static void list1node(FILE *, node *); +static void listnode(FILE *, node *); + +void +PyNode_ListTree(node *n) +{ + listnode(stdout, n); +} + +static int level, atbol; + +static void +listnode(FILE *fp, node *n) +{ + level = 0; + atbol = 1; + list1node(fp, n); +} + +static void +list1node(FILE *fp, node *n) +{ + if (n == 0) + return; + if (ISNONTERMINAL(TYPE(n))) { + int i; + for (i = 0; i < NCH(n); i++) + list1node(fp, CHILD(n, i)); + } + else if (ISTERMINAL(TYPE(n))) { + switch (TYPE(n)) { + case INDENT: + ++level; + break; + case DEDENT: + --level; + break; + default: + if (atbol) { + int i; + for (i = 0; i < level; ++i) + fprintf(fp, "\t"); + atbol = 0; + } + if (TYPE(n) == NEWLINE) { + if (STR(n) != NULL) + fprintf(fp, "%s", STR(n)); + fprintf(fp, "\n"); + atbol = 1; + } + else + fprintf(fp, "%s ", STR(n)); + break; + } + } + else + fprintf(fp, "? 
"); +} diff --git a/sys/src/cmd/python/Parser/metagrammar.c b/sys/src/cmd/python/Parser/metagrammar.c new file mode 100644 index 000000000..b61bc6d66 --- /dev/null +++ b/sys/src/cmd/python/Parser/metagrammar.c @@ -0,0 +1,159 @@ + +#include "pgenheaders.h" +#include "metagrammar.h" +#include "grammar.h" +#include "pgen.h" +static arc arcs_0_0[3] = { + {2, 0}, + {3, 0}, + {4, 1}, +}; +static arc arcs_0_1[1] = { + {0, 1}, +}; +static state states_0[2] = { + {3, arcs_0_0}, + {1, arcs_0_1}, +}; +static arc arcs_1_0[1] = { + {5, 1}, +}; +static arc arcs_1_1[1] = { + {6, 2}, +}; +static arc arcs_1_2[1] = { + {7, 3}, +}; +static arc arcs_1_3[1] = { + {3, 4}, +}; +static arc arcs_1_4[1] = { + {0, 4}, +}; +static state states_1[5] = { + {1, arcs_1_0}, + {1, arcs_1_1}, + {1, arcs_1_2}, + {1, arcs_1_3}, + {1, arcs_1_4}, +}; +static arc arcs_2_0[1] = { + {8, 1}, +}; +static arc arcs_2_1[2] = { + {9, 0}, + {0, 1}, +}; +static state states_2[2] = { + {1, arcs_2_0}, + {2, arcs_2_1}, +}; +static arc arcs_3_0[1] = { + {10, 1}, +}; +static arc arcs_3_1[2] = { + {10, 1}, + {0, 1}, +}; +static state states_3[2] = { + {1, arcs_3_0}, + {2, arcs_3_1}, +}; +static arc arcs_4_0[2] = { + {11, 1}, + {13, 2}, +}; +static arc arcs_4_1[1] = { + {7, 3}, +}; +static arc arcs_4_2[3] = { + {14, 4}, + {15, 4}, + {0, 2}, +}; +static arc arcs_4_3[1] = { + {12, 4}, +}; +static arc arcs_4_4[1] = { + {0, 4}, +}; +static state states_4[5] = { + {2, arcs_4_0}, + {1, arcs_4_1}, + {3, arcs_4_2}, + {1, arcs_4_3}, + {1, arcs_4_4}, +}; +static arc arcs_5_0[3] = { + {5, 1}, + {16, 1}, + {17, 2}, +}; +static arc arcs_5_1[1] = { + {0, 1}, +}; +static arc arcs_5_2[1] = { + {7, 3}, +}; +static arc arcs_5_3[1] = { + {18, 1}, +}; +static state states_5[4] = { + {3, arcs_5_0}, + {1, arcs_5_1}, + {1, arcs_5_2}, + {1, arcs_5_3}, +}; +static dfa dfas[6] = { + {256, "MSTART", 0, 2, states_0, + "\070\000\000"}, + {257, "RULE", 0, 5, states_1, + "\040\000\000"}, + {258, "RHS", 0, 2, states_2, + "\040\010\003"}, + {259, "ALT", 0, 2, states_3, + "\040\010\003"}, + {260, "ITEM", 0, 5, states_4, + "\040\010\003"}, + {261, "ATOM", 0, 4, states_5, + "\040\000\003"}, +}; +static label labels[19] = { + {0, "EMPTY"}, + {256, 0}, + {257, 0}, + {4, 0}, + {0, 0}, + {1, 0}, + {11, 0}, + {258, 0}, + {259, 0}, + {18, 0}, + {260, 0}, + {9, 0}, + {10, 0}, + {261, 0}, + {16, 0}, + {14, 0}, + {3, 0}, + {7, 0}, + {8, 0}, +}; +static grammar _PyParser_Grammar = { + 6, + dfas, + {19, labels}, + 256 +}; + +grammar * +meta_grammar(void) +{ + return &_PyParser_Grammar; +} + +grammar * +Py_meta_grammar(void) +{ + return meta_grammar(); +} diff --git a/sys/src/cmd/python/Parser/mkfile b/sys/src/cmd/python/Parser/mkfile new file mode 100644 index 000000000..01557576e --- /dev/null +++ b/sys/src/cmd/python/Parser/mkfile @@ -0,0 +1,44 @@ +APE=/sys/src/ape +<$APE/config + +LIB=/$objtype/lib/ape/libpython.a + +OFILES=\ + acceler.$O\ + bitset.$O\ + firstsets.$O\ + grammar1.$O\ + listnode.$O\ + metagrammar.$O\ + myreadline.$O\ + node.$O\ + parser.$O\ + parsetok.$O\ + tokenizer.$O\ + +CLEANFILES=$O.pgen + +</sys/src/cmd/mksyslib + +CFLAGS=-c -I.. 
-I../Include -DT$objtype -DPy_BUILD_CORE -DNDEBUG + +PGENOFILES=\ + acceler.$O\ + bitset.$O\ + firstsets.$O\ + grammar.$O\ + grammar1.$O\ + listnode.$O\ + metagrammar.$O\ + node.$O\ + parser.$O\ + parsetok.$O\ + pgen.$O\ + pgenmain.$O\ + printgrammar.$O\ + tokenizer_pgen.$O\ + ../Objects/obmalloc.$O\ + ../Python/mysnprintf.$O\ + +$O.pgen: $PGENOFILES + $LD -o $target $prereq diff --git a/sys/src/cmd/python/Parser/myreadline.c b/sys/src/cmd/python/Parser/myreadline.c new file mode 100644 index 000000000..32a1088ad --- /dev/null +++ b/sys/src/cmd/python/Parser/myreadline.c @@ -0,0 +1,219 @@ + +/* Readline interface for tokenizer.c and [raw_]input() in bltinmodule.c. + By default, or when stdin is not a tty device, we have a super + simple my_readline function using fgets. + Optionally, we can use the GNU readline library. + my_readline() has a different return value from GNU readline(): + - NULL if an interrupt occurred or if an error occurred + - a malloc'ed empty string if EOF was read + - a malloc'ed string ending in \n normally +*/ + +#include "Python.h" +#ifdef MS_WINDOWS +#define WIN32_LEAN_AND_MEAN +#include "windows.h" +#endif /* MS_WINDOWS */ + +#ifdef __VMS +extern char* vms__StdioReadline(FILE *sys_stdin, FILE *sys_stdout, char *prompt); +#endif + + +PyThreadState* _PyOS_ReadlineTState; + +#ifdef WITH_THREAD +#include "pythread.h" +static PyThread_type_lock _PyOS_ReadlineLock = NULL; +#endif + +int (*PyOS_InputHook)(void) = NULL; + +#ifdef RISCOS +int Py_RISCOSWimpFlag; +#endif + +/* This function restarts a fgets() after an EINTR error occurred + except if PyOS_InterruptOccurred() returns true. */ + +static int +my_fgets(char *buf, int len, FILE *fp) +{ + char *p; + for (;;) { + if (PyOS_InputHook != NULL) + (void)(PyOS_InputHook)(); + errno = 0; + p = fgets(buf, len, fp); + if (p != NULL) + return 0; /* No error */ +#ifdef MS_WINDOWS + /* In the case of a Ctrl+C or some other external event + interrupting the operation: + Win2k/NT: ERROR_OPERATION_ABORTED is the most recent Win32 + error code (and feof() returns TRUE). + Win9x: Ctrl+C seems to have no effect on fgets() returning + early - the signal handler is called, but the fgets() + only returns "normally" (ie, when Enter hit or feof()) + */ + if (GetLastError()==ERROR_OPERATION_ABORTED) { + /* Signals come asynchronously, so we sleep a brief + moment before checking if the handler has been + triggered (we cant just return 1 before the + signal handler has been called, as the later + signal may be treated as a separate interrupt). + */ + Sleep(1); + if (PyOS_InterruptOccurred()) { + return 1; /* Interrupt */ + } + /* Either the sleep wasn't long enough (need a + short loop retrying?) or not interrupted at all + (in which case we should revisit the whole thing!) + Logging some warning would be nice. assert is not + viable as under the debugger, the various dialogs + mean the condition is not true. 
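+			   A bounded retry (a hypothetical sketch, not
+			   what the code below does) would give the
+			   handler a little more time:
+
+			       int tries;
+			       for (tries = 0; tries < 10; tries++) {
+			           Sleep(1);
+			           if (PyOS_InterruptOccurred())
+			               return 1;
+			       }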
+ */ + } +#endif /* MS_WINDOWS */ + if (feof(fp)) { + return -1; /* EOF */ + } +#ifdef EINTR + if (errno == EINTR) { + int s; +#ifdef WITH_THREAD + PyEval_RestoreThread(_PyOS_ReadlineTState); +#endif + s = PyErr_CheckSignals(); +#ifdef WITH_THREAD + PyEval_SaveThread(); +#endif + if (s < 0) { + return 1; + } + } +#endif + if (PyOS_InterruptOccurred()) { + return 1; /* Interrupt */ + } + return -2; /* Error */ + } + /* NOTREACHED */ +} + + +/* Readline implementation using fgets() */ + +char * +PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, char *prompt) +{ + size_t n; + char *p; + n = 100; + if ((p = (char *)PyMem_MALLOC(n)) == NULL) + return NULL; + fflush(sys_stdout); +#ifndef RISCOS + if (prompt) + fprintf(stderr, "%s", prompt); +#else + if (prompt) { + if(Py_RISCOSWimpFlag) + fprintf(stderr, "\x0cr%s\x0c", prompt); + else + fprintf(stderr, "%s", prompt); + } +#endif + fflush(stderr); + switch (my_fgets(p, (int)n, sys_stdin)) { + case 0: /* Normal case */ + break; + case 1: /* Interrupt */ + PyMem_FREE(p); + return NULL; + case -1: /* EOF */ + case -2: /* Error */ + default: /* Shouldn't happen */ + *p = '\0'; + break; + } + n = strlen(p); + while (n > 0 && p[n-1] != '\n') { + size_t incr = n+2; + p = (char *)PyMem_REALLOC(p, n + incr); + if (p == NULL) + return NULL; + if (incr > INT_MAX) { + PyErr_SetString(PyExc_OverflowError, "input line too long"); + } + if (my_fgets(p+n, (int)incr, sys_stdin) != 0) + break; + n += strlen(p+n); + } + return (char *)PyMem_REALLOC(p, n+1); +} + + +/* By initializing this function pointer, systems embedding Python can + override the readline function. + + Note: Python expects in return a buffer allocated with PyMem_Malloc. */ + +char *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, char *); + + +/* Interface used by tokenizer.c and bltinmodule.c */ + +char * +PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, char *prompt) +{ + char *rv; + + if (_PyOS_ReadlineTState == PyThreadState_GET()) { + PyErr_SetString(PyExc_RuntimeError, + "can't re-enter readline"); + return NULL; + } + + + if (PyOS_ReadlineFunctionPointer == NULL) { +#ifdef __VMS + PyOS_ReadlineFunctionPointer = vms__StdioReadline; +#else + PyOS_ReadlineFunctionPointer = PyOS_StdioReadline; +#endif + } + +#ifdef WITH_THREAD + if (_PyOS_ReadlineLock == NULL) { + _PyOS_ReadlineLock = PyThread_allocate_lock(); + } +#endif + + _PyOS_ReadlineTState = PyThreadState_GET(); + Py_BEGIN_ALLOW_THREADS +#ifdef WITH_THREAD + PyThread_acquire_lock(_PyOS_ReadlineLock, 1); +#endif + + /* This is needed to handle the unlikely case that the + * interpreter is in interactive mode *and* stdin/out are not + * a tty. 
This can happen, for example if python is run like + * this: python -i < test1.py + */ + if (!isatty (fileno (sys_stdin)) || !isatty (fileno (sys_stdout))) + rv = PyOS_StdioReadline (sys_stdin, sys_stdout, prompt); + else + rv = (*PyOS_ReadlineFunctionPointer)(sys_stdin, sys_stdout, + prompt); + Py_END_ALLOW_THREADS + +#ifdef WITH_THREAD + PyThread_release_lock(_PyOS_ReadlineLock); +#endif + + _PyOS_ReadlineTState = NULL; + + return rv; +} diff --git a/sys/src/cmd/python/Parser/node.c b/sys/src/cmd/python/Parser/node.c new file mode 100644 index 000000000..d133a0d17 --- /dev/null +++ b/sys/src/cmd/python/Parser/node.c @@ -0,0 +1,135 @@ +/* Parse tree node implementation */ + +#include "Python.h" +#include "node.h" +#include "errcode.h" + +node * +PyNode_New(int type) +{ + node *n = (node *) PyObject_MALLOC(1 * sizeof(node)); + if (n == NULL) + return NULL; + n->n_type = type; + n->n_str = NULL; + n->n_lineno = 0; + n->n_nchildren = 0; + n->n_child = NULL; + return n; +} + +/* See comments at XXXROUNDUP below. Returns -1 on overflow. */ +static int +fancy_roundup(int n) +{ + /* Round up to the closest power of 2 >= n. */ + int result = 256; + assert(n > 128); + while (result < n) { + result <<= 1; + if (result <= 0) + return -1; + } + return result; +} + +/* A gimmick to make massive numbers of reallocs quicker. The result is + * a number >= the input. In PyNode_AddChild, it's used like so, when + * we're about to add child number current_size + 1: + * + * if XXXROUNDUP(current_size) < XXXROUNDUP(current_size + 1): + * allocate space for XXXROUNDUP(current_size + 1) total children + * else: + * we already have enough space + * + * Since a node starts out empty, we must have + * + * XXXROUNDUP(0) < XXXROUNDUP(1) + * + * so that we allocate space for the first child. One-child nodes are very + * common (presumably that would change if we used a more abstract form + * of syntax tree), so to avoid wasting memory it's desirable that + * XXXROUNDUP(1) == 1. That in turn forces XXXROUNDUP(0) == 0. + * + * Else for 2 <= n <= 128, we round up to the closest multiple of 4. Why 4? + * Rounding up to a multiple of an exact power of 2 is very efficient, and + * most nodes with more than one child have <= 4 kids. + * + * Else we call fancy_roundup() to grow proportionately to n. We've got an + * extreme case then (like test_longexp.py), and on many platforms doing + * anything less than proportional growth leads to exorbitant runtime + * (e.g., MacPython), or extreme fragmentation of user address space (e.g., + * Win98). + * + * In a run of compileall across the 2.3a0 Lib directory, Andrew MacIntyre + * reported that, with this scheme, 89% of PyObject_REALLOC calls in + * PyNode_AddChild passed 1 for the size, and 9% passed 4. So this usually + * wastes very little memory, but is very effective at sidestepping + * platform-realloc disasters on vulnerable platforms. + * + * Note that this would be straightforward if a node stored its current + * capacity. The code is tricky to avoid that. + */ +#define XXXROUNDUP(n) ((n) <= 1 ? (n) : \ + (n) <= 128 ? 
(((n) + 3) & ~3) : \ + fancy_roundup(n)) + + +int +PyNode_AddChild(register node *n1, int type, char *str, int lineno, int col_offset) +{ + const int nch = n1->n_nchildren; + int current_capacity; + int required_capacity; + node *n; + + if (nch == INT_MAX || nch < 0) + return E_OVERFLOW; + + current_capacity = XXXROUNDUP(nch); + required_capacity = XXXROUNDUP(nch + 1); + if (current_capacity < 0 || required_capacity < 0) + return E_OVERFLOW; + if (current_capacity < required_capacity) { + n = n1->n_child; + n = (node *) PyObject_REALLOC(n, + required_capacity * sizeof(node)); + if (n == NULL) + return E_NOMEM; + n1->n_child = n; + } + + n = &n1->n_child[n1->n_nchildren++]; + n->n_type = type; + n->n_str = str; + n->n_lineno = lineno; + n->n_col_offset = col_offset; + n->n_nchildren = 0; + n->n_child = NULL; + return 0; +} + +/* Forward */ +static void freechildren(node *); + + +void +PyNode_Free(node *n) +{ + if (n != NULL) { + freechildren(n); + PyObject_FREE(n); + } +} + +static void +freechildren(node *n) +{ + int i; + for (i = NCH(n); --i >= 0; ) + freechildren(CHILD(n, i)); + if (n->n_child != NULL) + PyObject_FREE(n->n_child); + if (STR(n) != NULL) + PyObject_FREE(STR(n)); +} diff --git a/sys/src/cmd/python/Parser/parser.c b/sys/src/cmd/python/Parser/parser.c new file mode 100644 index 000000000..2ce84cd32 --- /dev/null +++ b/sys/src/cmd/python/Parser/parser.c @@ -0,0 +1,433 @@ + +/* Parser implementation */ + +/* For a description, see the comments at end of this file */ + +/* XXX To do: error recovery */ + +#include "Python.h" +#include "pgenheaders.h" +#include "token.h" +#include "grammar.h" +#include "node.h" +#include "parser.h" +#include "errcode.h" + + +#ifdef Py_DEBUG +extern int Py_DebugFlag; +#define D(x) if (!Py_DebugFlag); else x +#else +#define D(x) +#endif + + +/* STACK DATA TYPE */ + +static void s_reset(stack *); + +static void +s_reset(stack *s) +{ + s->s_top = &s->s_base[MAXSTACK]; +} + +#define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK]) + +static int +s_push(register stack *s, dfa *d, node *parent) +{ + register stackentry *top; + if (s->s_top == s->s_base) { + fprintf(stderr, "s_push: parser stack overflow\n"); + return E_NOMEM; + } + top = --s->s_top; + top->s_dfa = d; + top->s_parent = parent; + top->s_state = 0; + return 0; +} + +#ifdef Py_DEBUG + +static void +s_pop(register stack *s) +{ + if (s_empty(s)) + Py_FatalError("s_pop: parser stack underflow -- FATAL"); + s->s_top++; +} + +#else /* !Py_DEBUG */ + +#define s_pop(s) (s)->s_top++ + +#endif + + +/* PARSER CREATION */ + +parser_state * +PyParser_New(grammar *g, int start) +{ + parser_state *ps; + + if (!g->g_accel) + PyGrammar_AddAccelerators(g); + ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state)); + if (ps == NULL) + return NULL; + ps->p_grammar = g; +#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD + ps->p_flags = 0; +#endif + ps->p_tree = PyNode_New(start); + if (ps->p_tree == NULL) { + PyMem_FREE(ps); + return NULL; + } + s_reset(&ps->p_stack); + (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree); + return ps; +} + +void +PyParser_Delete(parser_state *ps) +{ + /* NB If you want to save the parse tree, + you must set p_tree to NULL before calling delparser! 
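+	   For example (an illustrative sketch, not code from this file):
+
+	       node *tree = ps->p_tree;
+	       ps->p_tree = NULL;
+	       PyParser_Delete(ps);
+	       ... use tree ..., then PyNode_Free(tree);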
*/ + PyNode_Free(ps->p_tree); + PyMem_FREE(ps); +} + + +/* PARSER STACK OPERATIONS */ + +static int +shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset) +{ + int err; + assert(!s_empty(s)); + err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset); + if (err) + return err; + s->s_top->s_state = newstate; + return 0; +} + +static int +push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset) +{ + int err; + register node *n; + n = s->s_top->s_parent; + assert(!s_empty(s)); + err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset); + if (err) + return err; + s->s_top->s_state = newstate; + return s_push(s, d, CHILD(n, NCH(n)-1)); +} + + +/* PARSER PROPER */ + +static int +classify(parser_state *ps, int type, char *str) +{ + grammar *g = ps->p_grammar; + register int n = g->g_ll.ll_nlabels; + + if (type == NAME) { + register char *s = str; + register label *l = g->g_ll.ll_label; + register int i; + for (i = n; i > 0; i--, l++) { + if (l->lb_type != NAME || l->lb_str == NULL || + l->lb_str[0] != s[0] || + strcmp(l->lb_str, s) != 0) + continue; +#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD + if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) { + if (s[0] == 'w' && strcmp(s, "with") == 0) + break; /* not a keyword yet */ + else if (s[0] == 'a' && strcmp(s, "as") == 0) + break; /* not a keyword yet */ + } +#endif + D(printf("It's a keyword\n")); + return n - i; + } + } + + { + register label *l = g->g_ll.ll_label; + register int i; + for (i = n; i > 0; i--, l++) { + if (l->lb_type == type && l->lb_str == NULL) { + D(printf("It's a token we know\n")); + return n - i; + } + } + } + + D(printf("Illegal token\n")); + return -1; +} + +#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD +static void +future_hack(parser_state *ps) +{ + node *n = ps->p_stack.s_top->s_parent; + node *ch, *cch; + int i; + + /* from __future__ import ..., must have at least 4 children */ + n = CHILD(n, 0); + if (NCH(n) < 4) + return; + ch = CHILD(n, 0); + if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0) + return; + ch = CHILD(n, 1); + if (NCH(ch) == 1 && STR(CHILD(ch, 0)) && + strcmp(STR(CHILD(ch, 0)), "__future__") != 0) + return; + ch = CHILD(n, 3); + /* ch can be a star, a parenthesis or import_as_names */ + if (TYPE(ch) == STAR) + return; + if (TYPE(ch) == LPAR) + ch = CHILD(n, 4); + + for (i = 0; i < NCH(ch); i += 2) { + cch = CHILD(ch, i); + if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME && + strcmp(STR(CHILD(cch, 0)), "with_statement") == 0) { + ps->p_flags |= CO_FUTURE_WITH_STATEMENT; + break; + } + } +} +#endif /* future keyword */ + +int +PyParser_AddToken(register parser_state *ps, register int type, char *str, + int lineno, int col_offset, int *expected_ret) +{ + register int ilabel; + int err; + + D(printf("Token %s/'%s' ... 
", _PyParser_TokenNames[type], str)); + + /* Find out which label this token is */ + ilabel = classify(ps, type, str); + if (ilabel < 0) + return E_SYNTAX; + + /* Loop until the token is shifted or an error occurred */ + for (;;) { + /* Fetch the current dfa and state */ + register dfa *d = ps->p_stack.s_top->s_dfa; + register state *s = &d->d_state[ps->p_stack.s_top->s_state]; + + D(printf(" DFA '%s', state %d:", + d->d_name, ps->p_stack.s_top->s_state)); + + /* Check accelerator */ + if (s->s_lower <= ilabel && ilabel < s->s_upper) { + register int x = s->s_accel[ilabel - s->s_lower]; + if (x != -1) { + if (x & (1<<7)) { + /* Push non-terminal */ + int nt = (x >> 8) + NT_OFFSET; + int arrow = x & ((1<<7)-1); + dfa *d1 = PyGrammar_FindDFA( + ps->p_grammar, nt); + if ((err = push(&ps->p_stack, nt, d1, + arrow, lineno, col_offset)) > 0) { + D(printf(" MemError: push\n")); + return err; + } + D(printf(" Push ...\n")); + continue; + } + + /* Shift the token */ + if ((err = shift(&ps->p_stack, type, str, + x, lineno, col_offset)) > 0) { + D(printf(" MemError: shift.\n")); + return err; + } + D(printf(" Shift.\n")); + /* Pop while we are in an accept-only state */ + while (s = &d->d_state + [ps->p_stack.s_top->s_state], + s->s_accept && s->s_narcs == 1) { + D(printf(" DFA '%s', state %d: " + "Direct pop.\n", + d->d_name, + ps->p_stack.s_top->s_state)); +#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD + if (d->d_name[0] == 'i' && + strcmp(d->d_name, + "import_stmt") == 0) + future_hack(ps); +#endif + s_pop(&ps->p_stack); + if (s_empty(&ps->p_stack)) { + D(printf(" ACCEPT.\n")); + return E_DONE; + } + d = ps->p_stack.s_top->s_dfa; + } + return E_OK; + } + } + + if (s->s_accept) { +#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD + if (d->d_name[0] == 'i' && + strcmp(d->d_name, "import_stmt") == 0) + future_hack(ps); +#endif + /* Pop this dfa and try again */ + s_pop(&ps->p_stack); + D(printf(" Pop ...\n")); + if (s_empty(&ps->p_stack)) { + D(printf(" Error: bottom of stack.\n")); + return E_SYNTAX; + } + continue; + } + + /* Stuck, report syntax error */ + D(printf(" Error.\n")); + if (expected_ret) { + if (s->s_lower == s->s_upper - 1) { + /* Only one possible expected token */ + *expected_ret = ps->p_grammar-> + g_ll.ll_label[s->s_lower].lb_type; + } + else + *expected_ret = -1; + } + return E_SYNTAX; + } +} + + +#ifdef Py_DEBUG + +/* DEBUG OUTPUT */ + +void +dumptree(grammar *g, node *n) +{ + int i; + + if (n == NULL) + printf("NIL"); + else { + label l; + l.lb_type = TYPE(n); + l.lb_str = STR(n); + printf("%s", PyGrammar_LabelRepr(&l)); + if (ISNONTERMINAL(TYPE(n))) { + printf("("); + for (i = 0; i < NCH(n); i++) { + if (i > 0) + printf(","); + dumptree(g, CHILD(n, i)); + } + printf(")"); + } + } +} + +void +showtree(grammar *g, node *n) +{ + int i; + + if (n == NULL) + return; + if (ISNONTERMINAL(TYPE(n))) { + for (i = 0; i < NCH(n); i++) + showtree(g, CHILD(n, i)); + } + else if (ISTERMINAL(TYPE(n))) { + printf("%s", _PyParser_TokenNames[TYPE(n)]); + if (TYPE(n) == NUMBER || TYPE(n) == NAME) + printf("(%s)", STR(n)); + printf(" "); + } + else + printf("? 
"); +} + +void +printtree(parser_state *ps) +{ + if (Py_DebugFlag) { + printf("Parse tree:\n"); + dumptree(ps->p_grammar, ps->p_tree); + printf("\n"); + printf("Tokens:\n"); + showtree(ps->p_grammar, ps->p_tree); + printf("\n"); + } + printf("Listing:\n"); + PyNode_ListTree(ps->p_tree); + printf("\n"); +} + +#endif /* Py_DEBUG */ + +/* + +Description +----------- + +The parser's interface is different than usual: the function addtoken() +must be called for each token in the input. This makes it possible to +turn it into an incremental parsing system later. The parsing system +constructs a parse tree as it goes. + +A parsing rule is represented as a Deterministic Finite-state Automaton +(DFA). A node in a DFA represents a state of the parser; an arc represents +a transition. Transitions are either labeled with terminal symbols or +with non-terminals. When the parser decides to follow an arc labeled +with a non-terminal, it is invoked recursively with the DFA representing +the parsing rule for that as its initial state; when that DFA accepts, +the parser that invoked it continues. The parse tree constructed by the +recursively called parser is inserted as a child in the current parse tree. + +The DFA's can be constructed automatically from a more conventional +language description. An extended LL(1) grammar (ELL(1)) is suitable. +Certain restrictions make the parser's life easier: rules that can produce +the empty string should be outlawed (there are other ways to put loops +or optional parts in the language). To avoid the need to construct +FIRST sets, we can require that all but the last alternative of a rule +(really: arc going out of a DFA's state) must begin with a terminal +symbol. + +As an example, consider this grammar: + +expr: term (OP term)* +term: CONSTANT | '(' expr ')' + +The DFA corresponding to the rule for expr is: + +------->.---term-->.-------> + ^ | + | | + \----OP----/ + +The parse tree generated for the input a+b is: + +(expr: (term: (NAME: a)), (OP: +), (term: (NAME: b))) + +*/ diff --git a/sys/src/cmd/python/Parser/parser.h b/sys/src/cmd/python/Parser/parser.h new file mode 100644 index 000000000..bdca3e944 --- /dev/null +++ b/sys/src/cmd/python/Parser/parser.h @@ -0,0 +1,42 @@ +#ifndef Py_PARSER_H +#define Py_PARSER_H +#ifdef __cplusplus +extern "C" { +#endif + + +/* Parser interface */ + +#define MAXSTACK 500 + +typedef struct { + int s_state; /* State in current DFA */ + dfa *s_dfa; /* Current DFA */ + struct _node *s_parent; /* Where to add next node */ +} stackentry; + +typedef struct { + stackentry *s_top; /* Top entry */ + stackentry s_base[MAXSTACK];/* Array of stack entries */ + /* NB The stack grows down */ +} stack; + +typedef struct { + stack p_stack; /* Stack of parser states */ + grammar *p_grammar; /* Grammar to use */ + node *p_tree; /* Top of parse tree */ +#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD + unsigned long p_flags; /* see co_flags in Include/code.h */ +#endif +} parser_state; + +parser_state *PyParser_New(grammar *g, int start); +void PyParser_Delete(parser_state *ps); +int PyParser_AddToken(parser_state *ps, int type, char *str, int lineno, int col_offset, + int *expected_ret); +void PyGrammar_AddAccelerators(grammar *g); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_PARSER_H */ diff --git a/sys/src/cmd/python/Parser/parsetok.c b/sys/src/cmd/python/Parser/parsetok.c new file mode 100644 index 000000000..be53e1c59 --- /dev/null +++ b/sys/src/cmd/python/Parser/parsetok.c @@ -0,0 +1,260 @@ + +/* Parser-tokenizer link implementation */ + 
+#include "pgenheaders.h" +#include "tokenizer.h" +#include "node.h" +#include "grammar.h" +#include "parser.h" +#include "parsetok.h" +#include "errcode.h" +#include "graminit.h" + +int Py_TabcheckFlag; + + +/* Forward */ +static node *parsetok(struct tok_state *, grammar *, int, perrdetail *, int); +static void initerr(perrdetail *err_ret, const char* filename); + +/* Parse input coming from a string. Return error code, print some errors. */ +node * +PyParser_ParseString(const char *s, grammar *g, int start, perrdetail *err_ret) +{ + return PyParser_ParseStringFlagsFilename(s, NULL, g, start, err_ret, 0); +} + +node * +PyParser_ParseStringFlags(const char *s, grammar *g, int start, + perrdetail *err_ret, int flags) +{ + return PyParser_ParseStringFlagsFilename(s, NULL, + g, start, err_ret, flags); +} + +node * +PyParser_ParseStringFlagsFilename(const char *s, const char *filename, + grammar *g, int start, + perrdetail *err_ret, int flags) +{ + struct tok_state *tok; + + initerr(err_ret, filename); + + if ((tok = PyTokenizer_FromString(s)) == NULL) { + err_ret->error = PyErr_Occurred() ? E_DECODE : E_NOMEM; + return NULL; + } + + tok->filename = filename ? filename : "<string>"; + if (Py_TabcheckFlag || Py_VerboseFlag) { + tok->altwarning = (tok->filename != NULL); + if (Py_TabcheckFlag >= 2) + tok->alterror++; + } + + return parsetok(tok, g, start, err_ret, flags); +} + +/* Parse input coming from a file. Return error code, print some errors. */ + +node * +PyParser_ParseFile(FILE *fp, const char *filename, grammar *g, int start, + char *ps1, char *ps2, perrdetail *err_ret) +{ + return PyParser_ParseFileFlags(fp, filename, g, start, ps1, ps2, + err_ret, 0); +} + +node * +PyParser_ParseFileFlags(FILE *fp, const char *filename, grammar *g, int start, + char *ps1, char *ps2, perrdetail *err_ret, int flags) +{ + struct tok_state *tok; + + initerr(err_ret, filename); + + if ((tok = PyTokenizer_FromFile(fp, ps1, ps2)) == NULL) { + err_ret->error = E_NOMEM; + return NULL; + } + tok->filename = filename; + if (Py_TabcheckFlag || Py_VerboseFlag) { + tok->altwarning = (filename != NULL); + if (Py_TabcheckFlag >= 2) + tok->alterror++; + } + + + return parsetok(tok, g, start, err_ret, flags); +} + +/* Parse input coming from the given tokenizer structure. + Return error code. 
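+   On success the parse tree is returned and ownership passes to the
+   caller; on failure NULL is returned and the details are recorded in
+   *err_ret.  The tokenizer is freed in either case.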
*/ + +static char with_msg[] = +"%s:%d: Warning: 'with' will become a reserved keyword in Python 2.6\n"; + +static char as_msg[] = +"%s:%d: Warning: 'as' will become a reserved keyword in Python 2.6\n"; + +static void +warn(const char *msg, const char *filename, int lineno) +{ + if (filename == NULL) + filename = "<string>"; + PySys_WriteStderr(msg, filename, lineno); +} + +static node * +parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, + int flags) +{ + parser_state *ps; + node *n; + int started = 0, handling_import = 0, handling_with = 0; + + if ((ps = PyParser_New(g, start)) == NULL) { + fprintf(stderr, "no mem for new parser\n"); + err_ret->error = E_NOMEM; + PyTokenizer_Free(tok); + return NULL; + } +#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD + if (flags & PyPARSE_WITH_IS_KEYWORD) + ps->p_flags |= CO_FUTURE_WITH_STATEMENT; +#endif + + for (;;) { + char *a, *b; + int type; + size_t len; + char *str; + int col_offset; + + type = PyTokenizer_Get(tok, &a, &b); + if (type == ERRORTOKEN) { + err_ret->error = tok->done; + break; + } + if (type == ENDMARKER && started) { + type = NEWLINE; /* Add an extra newline */ + handling_with = handling_import = 0; + started = 0; + /* Add the right number of dedent tokens, + except if a certain flag is given -- + codeop.py uses this. */ + if (tok->indent && + !(flags & PyPARSE_DONT_IMPLY_DEDENT)) + { + tok->pendin = -tok->indent; + tok->indent = 0; + } + } + else + started = 1; + len = b - a; /* XXX this may compute NULL - NULL */ + str = (char *) PyObject_MALLOC(len + 1); + if (str == NULL) { + fprintf(stderr, "no mem for next token\n"); + err_ret->error = E_NOMEM; + break; + } + if (len > 0) + strncpy(str, a, len); + str[len] = '\0'; + +#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD + /* This is only necessary to support the "as" warning, but + we don't want to warn about "as" in import statements. 
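+		   For example, "import foo as bar" must stay quiet,
+		   while a plain NAME spelled "as" anywhere else gets
+		   the warning below.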
*/ + if (type == NAME && + len == 6 && str[0] == 'i' && strcmp(str, "import") == 0) + handling_import = 1; + + /* Warn about with as NAME */ + if (type == NAME && + !(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) { + if (len == 4 && str[0] == 'w' && strcmp(str, "with") == 0) + warn(with_msg, err_ret->filename, tok->lineno); + else if (!(handling_import || handling_with) && + len == 2 && str[0] == 'a' && + strcmp(str, "as") == 0) + warn(as_msg, err_ret->filename, tok->lineno); + } + else if (type == NAME && + (ps->p_flags & CO_FUTURE_WITH_STATEMENT) && + len == 4 && str[0] == 'w' && strcmp(str, "with") == 0) + handling_with = 1; +#endif + if (a >= tok->line_start) + col_offset = a - tok->line_start; + else + col_offset = -1; + + if ((err_ret->error = + PyParser_AddToken(ps, (int)type, str, tok->lineno, col_offset, + &(err_ret->expected))) != E_OK) { + if (err_ret->error != E_DONE) { + PyObject_FREE(str); + err_ret->token = type; + } + break; + } + } + + if (err_ret->error == E_DONE) { + n = ps->p_tree; + ps->p_tree = NULL; + } + else + n = NULL; + + PyParser_Delete(ps); + + if (n == NULL) { + if (tok->lineno <= 1 && tok->done == E_EOF) + err_ret->error = E_EOF; + err_ret->lineno = tok->lineno; + if (tok->buf != NULL) { + size_t len; + assert(tok->cur - tok->buf < INT_MAX); + err_ret->offset = (int)(tok->cur - tok->buf); + len = tok->inp - tok->buf; + err_ret->text = (char *) PyObject_MALLOC(len + 1); + if (err_ret->text != NULL) { + if (len > 0) + strncpy(err_ret->text, tok->buf, len); + err_ret->text[len] = '\0'; + } + } + } else if (tok->encoding != NULL) { + node* r = PyNode_New(encoding_decl); + if (!r) { + err_ret->error = E_NOMEM; + n = NULL; + goto done; + } + r->n_str = tok->encoding; + r->n_nchildren = 1; + r->n_child = n; + tok->encoding = NULL; + n = r; + } + +done: + PyTokenizer_Free(tok); + + return n; +} + +static void +initerr(perrdetail *err_ret, const char *filename) +{ + err_ret->error = E_OK; + err_ret->filename = filename; + err_ret->lineno = 0; + err_ret->offset = 0; + err_ret->text = NULL; + err_ret->token = -1; + err_ret->expected = -1; +} diff --git a/sys/src/cmd/python/Parser/pgen.c b/sys/src/cmd/python/Parser/pgen.c new file mode 100644 index 000000000..dfe7cacba --- /dev/null +++ b/sys/src/cmd/python/Parser/pgen.c @@ -0,0 +1,706 @@ +/* Parser generator */ + +/* For a description, see the comments at end of this file */ + +#include "Python.h" +#include "pgenheaders.h" +#include "token.h" +#include "node.h" +#include "grammar.h" +#include "metagrammar.h" +#include "pgen.h" + +extern int Py_DebugFlag; +extern int Py_IgnoreEnvironmentFlag; /* needed by Py_GETENV */ + + +/* PART ONE -- CONSTRUCT NFA -- Cf. 
Algorithm 3.2 from [Aho&Ullman 77] */ + +typedef struct _nfaarc { + int ar_label; + int ar_arrow; +} nfaarc; + +typedef struct _nfastate { + int st_narcs; + nfaarc *st_arc; +} nfastate; + +typedef struct _nfa { + int nf_type; + char *nf_name; + int nf_nstates; + nfastate *nf_state; + int nf_start, nf_finish; +} nfa; + +/* Forward */ +static void compile_rhs(labellist *ll, + nfa *nf, node *n, int *pa, int *pb); +static void compile_alt(labellist *ll, + nfa *nf, node *n, int *pa, int *pb); +static void compile_item(labellist *ll, + nfa *nf, node *n, int *pa, int *pb); +static void compile_atom(labellist *ll, + nfa *nf, node *n, int *pa, int *pb); + +static int +addnfastate(nfa *nf) +{ + nfastate *st; + + nf->nf_state = (nfastate *)PyObject_REALLOC(nf->nf_state, + sizeof(nfastate) * (nf->nf_nstates + 1)); + if (nf->nf_state == NULL) + Py_FatalError("out of mem"); + st = &nf->nf_state[nf->nf_nstates++]; + st->st_narcs = 0; + st->st_arc = NULL; + return st - nf->nf_state; +} + +static void +addnfaarc(nfa *nf, int from, int to, int lbl) +{ + nfastate *st; + nfaarc *ar; + + st = &nf->nf_state[from]; + st->st_arc = (nfaarc *)PyObject_REALLOC(st->st_arc, + sizeof(nfaarc) * (st->st_narcs + 1)); + if (st->st_arc == NULL) + Py_FatalError("out of mem"); + ar = &st->st_arc[st->st_narcs++]; + ar->ar_label = lbl; + ar->ar_arrow = to; +} + +static nfa * +newnfa(char *name) +{ + nfa *nf; + static int type = NT_OFFSET; /* All types will be disjunct */ + + nf = (nfa *)PyObject_MALLOC(sizeof(nfa)); + if (nf == NULL) + Py_FatalError("no mem for new nfa"); + nf->nf_type = type++; + nf->nf_name = name; /* XXX strdup(name) ??? */ + nf->nf_nstates = 0; + nf->nf_state = NULL; + nf->nf_start = nf->nf_finish = -1; + return nf; +} + +typedef struct _nfagrammar { + int gr_nnfas; + nfa **gr_nfa; + labellist gr_ll; +} nfagrammar; + +/* Forward */ +static void compile_rule(nfagrammar *gr, node *n); + +static nfagrammar * +newnfagrammar(void) +{ + nfagrammar *gr; + + gr = (nfagrammar *)PyObject_MALLOC(sizeof(nfagrammar)); + if (gr == NULL) + Py_FatalError("no mem for new nfa grammar"); + gr->gr_nnfas = 0; + gr->gr_nfa = NULL; + gr->gr_ll.ll_nlabels = 0; + gr->gr_ll.ll_label = NULL; + addlabel(&gr->gr_ll, ENDMARKER, "EMPTY"); + return gr; +} + +static nfa * +addnfa(nfagrammar *gr, char *name) +{ + nfa *nf; + + nf = newnfa(name); + gr->gr_nfa = (nfa **)PyObject_REALLOC(gr->gr_nfa, + sizeof(nfa) * (gr->gr_nnfas + 1)); + if (gr->gr_nfa == NULL) + Py_FatalError("out of mem"); + gr->gr_nfa[gr->gr_nnfas++] = nf; + addlabel(&gr->gr_ll, NAME, nf->nf_name); + return nf; +} + +#ifdef Py_DEBUG + +static char REQNFMT[] = "metacompile: less than %d children\n"; + +#define REQN(i, count) \ + if (i < count) { \ + fprintf(stderr, REQNFMT, count); \ + Py_FatalError("REQN"); \ + } else + +#else +#define REQN(i, count) /* empty */ +#endif + +static nfagrammar * +metacompile(node *n) +{ + nfagrammar *gr; + int i; + + if (Py_DebugFlag) + printf("Compiling (meta-) parse tree into NFA grammar\n"); + gr = newnfagrammar(); + REQ(n, MSTART); + i = n->n_nchildren - 1; /* Last child is ENDMARKER */ + n = n->n_child; + for (; --i >= 0; n++) { + if (n->n_type != NEWLINE) + compile_rule(gr, n); + } + return gr; +} + +static void +compile_rule(nfagrammar *gr, node *n) +{ + nfa *nf; + + REQ(n, RULE); + REQN(n->n_nchildren, 4); + n = n->n_child; + REQ(n, NAME); + nf = addnfa(gr, n->n_str); + n++; + REQ(n, COLON); + n++; + REQ(n, RHS); + compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish); + n++; + REQ(n, NEWLINE); +} + +static void 
+compile_rhs(labellist *ll, nfa *nf, node *n, int *pa, int *pb) +{ + int i; + int a, b; + + REQ(n, RHS); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + REQ(n, ALT); + compile_alt(ll, nf, n, pa, pb); + if (--i <= 0) + return; + n++; + a = *pa; + b = *pb; + *pa = addnfastate(nf); + *pb = addnfastate(nf); + addnfaarc(nf, *pa, a, EMPTY); + addnfaarc(nf, b, *pb, EMPTY); + for (; --i >= 0; n++) { + REQ(n, VBAR); + REQN(i, 1); + --i; + n++; + REQ(n, ALT); + compile_alt(ll, nf, n, &a, &b); + addnfaarc(nf, *pa, a, EMPTY); + addnfaarc(nf, b, *pb, EMPTY); + } +} + +static void +compile_alt(labellist *ll, nfa *nf, node *n, int *pa, int *pb) +{ + int i; + int a, b; + + REQ(n, ALT); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + REQ(n, ITEM); + compile_item(ll, nf, n, pa, pb); + --i; + n++; + for (; --i >= 0; n++) { + REQ(n, ITEM); + compile_item(ll, nf, n, &a, &b); + addnfaarc(nf, *pb, a, EMPTY); + *pb = b; + } +} + +static void +compile_item(labellist *ll, nfa *nf, node *n, int *pa, int *pb) +{ + int i; + int a, b; + + REQ(n, ITEM); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + if (n->n_type == LSQB) { + REQN(i, 3); + n++; + REQ(n, RHS); + *pa = addnfastate(nf); + *pb = addnfastate(nf); + addnfaarc(nf, *pa, *pb, EMPTY); + compile_rhs(ll, nf, n, &a, &b); + addnfaarc(nf, *pa, a, EMPTY); + addnfaarc(nf, b, *pb, EMPTY); + REQN(i, 1); + n++; + REQ(n, RSQB); + } + else { + compile_atom(ll, nf, n, pa, pb); + if (--i <= 0) + return; + n++; + addnfaarc(nf, *pb, *pa, EMPTY); + if (n->n_type == STAR) + *pb = *pa; + else + REQ(n, PLUS); + } +} + +static void +compile_atom(labellist *ll, nfa *nf, node *n, int *pa, int *pb) +{ + int i; + + REQ(n, ATOM); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + if (n->n_type == LPAR) { + REQN(i, 3); + n++; + REQ(n, RHS); + compile_rhs(ll, nf, n, pa, pb); + n++; + REQ(n, RPAR); + } + else if (n->n_type == NAME || n->n_type == STRING) { + *pa = addnfastate(nf); + *pb = addnfastate(nf); + addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str)); + } + else + REQ(n, NAME); +} + +static void +dumpstate(labellist *ll, nfa *nf, int istate) +{ + nfastate *st; + int i; + nfaarc *ar; + + printf("%c%2d%c", + istate == nf->nf_start ? '*' : ' ', + istate, + istate == nf->nf_finish ? '.' 
: ' '); + st = &nf->nf_state[istate]; + ar = st->st_arc; + for (i = 0; i < st->st_narcs; i++) { + if (i > 0) + printf("\n "); + printf("-> %2d %s", ar->ar_arrow, + PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label])); + ar++; + } + printf("\n"); +} + +static void +dumpnfa(labellist *ll, nfa *nf) +{ + int i; + + printf("NFA '%s' has %d states; start %d, finish %d\n", + nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish); + for (i = 0; i < nf->nf_nstates; i++) + dumpstate(ll, nf, i); +} + + +/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */ + +static void +addclosure(bitset ss, nfa *nf, int istate) +{ + if (addbit(ss, istate)) { + nfastate *st = &nf->nf_state[istate]; + nfaarc *ar = st->st_arc; + int i; + + for (i = st->st_narcs; --i >= 0; ) { + if (ar->ar_label == EMPTY) + addclosure(ss, nf, ar->ar_arrow); + ar++; + } + } +} + +typedef struct _ss_arc { + bitset sa_bitset; + int sa_arrow; + int sa_label; +} ss_arc; + +typedef struct _ss_state { + bitset ss_ss; + int ss_narcs; + struct _ss_arc *ss_arc; + int ss_deleted; + int ss_finish; + int ss_rename; +} ss_state; + +typedef struct _ss_dfa { + int sd_nstates; + ss_state *sd_state; +} ss_dfa; + +/* Forward */ +static void printssdfa(int xx_nstates, ss_state *xx_state, int nbits, + labellist *ll, char *msg); +static void simplify(int xx_nstates, ss_state *xx_state); +static void convert(dfa *d, int xx_nstates, ss_state *xx_state); + +static void +makedfa(nfagrammar *gr, nfa *nf, dfa *d) +{ + int nbits = nf->nf_nstates; + bitset ss; + int xx_nstates; + ss_state *xx_state, *yy; + ss_arc *zz; + int istate, jstate, iarc, jarc, ibit; + nfastate *st; + nfaarc *ar; + + ss = newbitset(nbits); + addclosure(ss, nf, nf->nf_start); + xx_state = (ss_state *)PyObject_MALLOC(sizeof(ss_state)); + if (xx_state == NULL) + Py_FatalError("no mem for xx_state in makedfa"); + xx_nstates = 1; + yy = &xx_state[0]; + yy->ss_ss = ss; + yy->ss_narcs = 0; + yy->ss_arc = NULL; + yy->ss_deleted = 0; + yy->ss_finish = testbit(ss, nf->nf_finish); + if (yy->ss_finish) + printf("Error: nonterminal '%s' may produce empty.\n", + nf->nf_name); + + /* This algorithm is from a book written before + the invention of structured programming... */ + + /* For each unmarked state... */ + for (istate = 0; istate < xx_nstates; ++istate) { + size_t size; + yy = &xx_state[istate]; + ss = yy->ss_ss; + /* For all its states... */ + for (ibit = 0; ibit < nf->nf_nstates; ++ibit) { + if (!testbit(ss, ibit)) + continue; + st = &nf->nf_state[ibit]; + /* For all non-empty arcs from this state... 
*/
+			for (iarc = 0; iarc < st->st_narcs; iarc++) {
+				ar = &st->st_arc[iarc];
+				if (ar->ar_label == EMPTY)
+					continue;
+				/* Look up in list of arcs from this state */
+				for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
+					zz = &yy->ss_arc[jarc];
+					if (ar->ar_label == zz->sa_label)
+						goto found;
+				}
+				/* Add new arc for this state */
+				size = sizeof(ss_arc) * (yy->ss_narcs + 1);
+				yy->ss_arc = (ss_arc *)PyObject_REALLOC(
+						yy->ss_arc, size);
+				if (yy->ss_arc == NULL)
+					Py_FatalError("out of mem");
+				zz = &yy->ss_arc[yy->ss_narcs++];
+				zz->sa_label = ar->ar_label;
+				zz->sa_bitset = newbitset(nbits);
+				zz->sa_arrow = -1;
+		 found:	;
+				/* Add destination */
+				addclosure(zz->sa_bitset, nf, ar->ar_arrow);
+			}
+		}
+		/* Now look up all the arrow states */
+		for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
+			zz = &xx_state[istate].ss_arc[jarc];
+			for (jstate = 0; jstate < xx_nstates; jstate++) {
+				if (samebitset(zz->sa_bitset,
+					xx_state[jstate].ss_ss, nbits)) {
+					zz->sa_arrow = jstate;
+					goto done;
+				}
+			}
+			size = sizeof(ss_state) * (xx_nstates + 1);
+			xx_state = (ss_state *)PyObject_REALLOC(xx_state,
+							size);
+			if (xx_state == NULL)
+				Py_FatalError("out of mem");
+			zz->sa_arrow = xx_nstates;
+			yy = &xx_state[xx_nstates++];
+			yy->ss_ss = zz->sa_bitset;
+			yy->ss_narcs = 0;
+			yy->ss_arc = NULL;
+			yy->ss_deleted = 0;
+			yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
+		 done:	;
+		}
+	}
+
+	if (Py_DebugFlag)
+		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
+						"before minimizing");
+
+	simplify(xx_nstates, xx_state);
+
+	if (Py_DebugFlag)
+		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
+						"after minimizing");
+
+	convert(d, xx_nstates, xx_state);
+
+	/* XXX cleanup */
+}
+
+static void
+printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
+	   labellist *ll, char *msg)
+{
+	int i, ibit, iarc;
+	ss_state *yy;
+	ss_arc *zz;
+
+	printf("Subset DFA %s\n", msg);
+	for (i = 0; i < xx_nstates; i++) {
+		yy = &xx_state[i];
+		if (yy->ss_deleted)
+			continue;
+		printf(" Subset %d", i);
+		if (yy->ss_finish)
+			printf(" (finish)");
+		printf(" { ");
+		for (ibit = 0; ibit < nbits; ibit++) {
+			if (testbit(yy->ss_ss, ibit))
+				printf("%d ", ibit);
+		}
+		printf("}\n");
+		for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
+			zz = &yy->ss_arc[iarc];
+			printf("  Arc to state %d, label %s\n",
+				zz->sa_arrow,
+				PyGrammar_LabelRepr(
+					&ll->ll_label[zz->sa_label]));
+		}
+	}
+}
+
+
+/* PART THREE -- SIMPLIFY DFA */
+
+/* Simplify the DFA by repeatedly eliminating states that are
+   equivalent to another one.  This is NOT Algorithm 3.3 from
+   [Aho&Ullman 77].  It does not always find the minimal DFA,
+   but it does usually make a much smaller one...  (For an example
+   of sub-optimal behavior, try S: x a b+ | y a b+.)
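+   Roughly: the two b+ loops become distinct subset states whose
+   self-arcs carry different state numbers, so the pairwise
+   samestate() test never finds them equal, even though they accept
+   the same language.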
+*/ + +static int +samestate(ss_state *s1, ss_state *s2) +{ + int i; + + if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish) + return 0; + for (i = 0; i < s1->ss_narcs; i++) { + if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow || + s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label) + return 0; + } + return 1; +} + +static void +renamestates(int xx_nstates, ss_state *xx_state, int from, int to) +{ + int i, j; + + if (Py_DebugFlag) + printf("Rename state %d to %d.\n", from, to); + for (i = 0; i < xx_nstates; i++) { + if (xx_state[i].ss_deleted) + continue; + for (j = 0; j < xx_state[i].ss_narcs; j++) { + if (xx_state[i].ss_arc[j].sa_arrow == from) + xx_state[i].ss_arc[j].sa_arrow = to; + } + } +} + +static void +simplify(int xx_nstates, ss_state *xx_state) +{ + int changes; + int i, j; + + do { + changes = 0; + for (i = 1; i < xx_nstates; i++) { + if (xx_state[i].ss_deleted) + continue; + for (j = 0; j < i; j++) { + if (xx_state[j].ss_deleted) + continue; + if (samestate(&xx_state[i], &xx_state[j])) { + xx_state[i].ss_deleted++; + renamestates(xx_nstates, xx_state, + i, j); + changes++; + break; + } + } + } + } while (changes); +} + + +/* PART FOUR -- GENERATE PARSING TABLES */ + +/* Convert the DFA into a grammar that can be used by our parser */ + +static void +convert(dfa *d, int xx_nstates, ss_state *xx_state) +{ + int i, j; + ss_state *yy; + ss_arc *zz; + + for (i = 0; i < xx_nstates; i++) { + yy = &xx_state[i]; + if (yy->ss_deleted) + continue; + yy->ss_rename = addstate(d); + } + + for (i = 0; i < xx_nstates; i++) { + yy = &xx_state[i]; + if (yy->ss_deleted) + continue; + for (j = 0; j < yy->ss_narcs; j++) { + zz = &yy->ss_arc[j]; + addarc(d, yy->ss_rename, + xx_state[zz->sa_arrow].ss_rename, + zz->sa_label); + } + if (yy->ss_finish) + addarc(d, yy->ss_rename, yy->ss_rename, 0); + } + + d->d_initial = 0; +} + + +/* PART FIVE -- GLUE IT ALL TOGETHER */ + +static grammar * +maketables(nfagrammar *gr) +{ + int i; + nfa *nf; + dfa *d; + grammar *g; + + if (gr->gr_nnfas == 0) + return NULL; + g = newgrammar(gr->gr_nfa[0]->nf_type); + /* XXX first rule must be start rule */ + g->g_ll = gr->gr_ll; + + for (i = 0; i < gr->gr_nnfas; i++) { + nf = gr->gr_nfa[i]; + if (Py_DebugFlag) { + printf("Dump of NFA for '%s' ...\n", nf->nf_name); + dumpnfa(&gr->gr_ll, nf); + printf("Making DFA for '%s' ...\n", nf->nf_name); + } + d = adddfa(g, nf->nf_type, nf->nf_name); + makedfa(gr, gr->gr_nfa[i], d); + } + + return g; +} + +grammar * +pgen(node *n) +{ + nfagrammar *gr; + grammar *g; + + gr = metacompile(n); + g = maketables(gr); + translatelabels(g); + addfirstsets(g); + return g; +} + +grammar * +Py_pgen(node *n) +{ + return pgen(n); +} + +/* + +Description +----------- + +Input is a grammar in extended BNF (using * for repetition, + for +at-least-once repetition, [] for optional parts, | for alternatives and +() for grouping). This has already been parsed and turned into a parse +tree. + +Each rule is considered as a regular expression in its own right. +It is turned into a Non-deterministic Finite Automaton (NFA), which +is then turned into a Deterministic Finite Automaton (DFA), which is then +optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, +or similar compiler books (this technique is more often used for lexical +analyzers). + +The DFA's are used by the parser as parsing tables in a special way +that's probably unique. Before they are usable, the FIRST sets of all +non-terminals are computed. 
+ +Reference +--------- + +[Aho&Ullman 77] + Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 + (first edition) + +*/ diff --git a/sys/src/cmd/python/Parser/pgenmain.c b/sys/src/cmd/python/Parser/pgenmain.c new file mode 100644 index 000000000..fc27a2cd1 --- /dev/null +++ b/sys/src/cmd/python/Parser/pgenmain.c @@ -0,0 +1,173 @@ + +/* Parser generator main program */ + +/* This expects a filename containing the grammar as argv[1] (UNIX) + or asks the console for such a file name (THINK C). + It writes its output on two files in the current directory: + - "graminit.c" gets the grammar as a bunch of initialized data + - "graminit.h" gets the grammar's non-terminals as #defines. + Error messages and status info during the generation process are + written to stdout, or sometimes to stderr. */ + +/* XXX TO DO: + - check for duplicate definitions of names (instead of fatal err) +*/ + +#include "Python.h" +#include "pgenheaders.h" +#include "grammar.h" +#include "node.h" +#include "parsetok.h" +#include "pgen.h" + +int Py_DebugFlag; +int Py_VerboseFlag; +int Py_IgnoreEnvironmentFlag; + +/* Forward */ +grammar *getgrammar(char *filename); + +void +Py_Exit(int sts) +{ + exit(sts); +} + +int +main(int argc, char **argv) +{ + grammar *g; + FILE *fp; + char *filename, *graminit_h, *graminit_c; + + if (argc != 4) { + fprintf(stderr, + "usage: %s grammar graminit.h graminit.c\n", argv[0]); + Py_Exit(2); + } + filename = argv[1]; + graminit_h = argv[2]; + graminit_c = argv[3]; + g = getgrammar(filename); + fp = fopen(graminit_c, "w"); + if (fp == NULL) { + perror(graminit_c); + Py_Exit(1); + } + if (Py_DebugFlag) + printf("Writing %s ...\n", graminit_c); + printgrammar(g, fp); + fclose(fp); + fp = fopen(graminit_h, "w"); + if (fp == NULL) { + perror(graminit_h); + Py_Exit(1); + } + if (Py_DebugFlag) + printf("Writing %s ...\n", graminit_h); + printnonterminals(g, fp); + fclose(fp); + Py_Exit(0); + return 0; /* Make gcc -Wall happy */ +} + +grammar * +getgrammar(char *filename) +{ + FILE *fp; + node *n; + grammar *g0, *g; + perrdetail err; + + fp = fopen(filename, "r"); + if (fp == NULL) { + perror(filename); + Py_Exit(1); + } + g0 = meta_grammar(); + n = PyParser_ParseFile(fp, filename, g0, g0->g_start, + (char *)NULL, (char *)NULL, &err); + fclose(fp); + if (n == NULL) { + fprintf(stderr, "Parsing error %d, line %d.\n", + err.error, err.lineno); + if (err.text != NULL) { + size_t i; + fprintf(stderr, "%s", err.text); + i = strlen(err.text); + if (i == 0 || err.text[i-1] != '\n') + fprintf(stderr, "\n"); + for (i = 0; i < err.offset; i++) { + if (err.text[i] == '\t') + putc('\t', stderr); + else + putc(' ', stderr); + } + fprintf(stderr, "^\n"); + PyObject_FREE(err.text); + } + Py_Exit(1); + } + g = pgen(n); + if (g == NULL) { + printf("Bad grammar.\n"); + Py_Exit(1); + } + return g; +} + +/* Can't happen in pgen */ +PyObject* +PyErr_Occurred() +{ + return 0; +} + +void +Py_FatalError(const char *msg) +{ + fprintf(stderr, "pgen: FATAL ERROR: %s\n", msg); + Py_Exit(1); +} + +/* No-nonsense my_readline() for tokenizer.c */ + +char * +PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, char *prompt) +{ + size_t n = 1000; + char *p = (char *)PyMem_MALLOC(n); + char *q; + if (p == NULL) + return NULL; + fprintf(stderr, "%s", prompt); + q = fgets(p, n, sys_stdin); + if (q == NULL) { + *p = '\0'; + return p; + } + n = strlen(p); + if (n > 0 && p[n-1] != '\n') + p[n-1] = '\n'; + return (char *)PyMem_REALLOC(p, n+1); +} + +/* No-nonsense fgets */ +char * +Py_UniversalNewlineFgets(char *buf, int n, FILE 
*stream, PyObject *fobj) +{ + return fgets(buf, n, stream); +} + + +#include <stdarg.h> + +void +PySys_WriteStderr(const char *format, ...) +{ + va_list va; + + va_start(va, format); + vfprintf(stderr, format, va); + va_end(va); +} diff --git a/sys/src/cmd/python/Parser/printgrammar.c b/sys/src/cmd/python/Parser/printgrammar.c new file mode 100644 index 000000000..554069814 --- /dev/null +++ b/sys/src/cmd/python/Parser/printgrammar.c @@ -0,0 +1,113 @@ + +/* Print a bunch of C initializers that represent a grammar */ + +#include "pgenheaders.h" +#include "grammar.h" + +/* Forward */ +static void printarcs(int, dfa *, FILE *); +static void printstates(grammar *, FILE *); +static void printdfas(grammar *, FILE *); +static void printlabels(grammar *, FILE *); + +void +printgrammar(grammar *g, FILE *fp) +{ + fprintf(fp, "#include \"pgenheaders.h\"\n"); + fprintf(fp, "#include \"grammar.h\"\n"); + printdfas(g, fp); + printlabels(g, fp); + fprintf(fp, "grammar _PyParser_Grammar = {\n"); + fprintf(fp, "\t%d,\n", g->g_ndfas); + fprintf(fp, "\tdfas,\n"); + fprintf(fp, "\t{%d, labels},\n", g->g_ll.ll_nlabels); + fprintf(fp, "\t%d\n", g->g_start); + fprintf(fp, "};\n"); +} + +void +printnonterminals(grammar *g, FILE *fp) +{ + dfa *d; + int i; + + d = g->g_dfa; + for (i = g->g_ndfas; --i >= 0; d++) + fprintf(fp, "#define %s %d\n", d->d_name, d->d_type); +} + +static void +printarcs(int i, dfa *d, FILE *fp) +{ + arc *a; + state *s; + int j, k; + + s = d->d_state; + for (j = 0; j < d->d_nstates; j++, s++) { + fprintf(fp, "static arc arcs_%d_%d[%d] = {\n", + i, j, s->s_narcs); + a = s->s_arc; + for (k = 0; k < s->s_narcs; k++, a++) + fprintf(fp, "\t{%d, %d},\n", a->a_lbl, a->a_arrow); + fprintf(fp, "};\n"); + } +} + +static void +printstates(grammar *g, FILE *fp) +{ + state *s; + dfa *d; + int i, j; + + d = g->g_dfa; + for (i = 0; i < g->g_ndfas; i++, d++) { + printarcs(i, d, fp); + fprintf(fp, "static state states_%d[%d] = {\n", + i, d->d_nstates); + s = d->d_state; + for (j = 0; j < d->d_nstates; j++, s++) + fprintf(fp, "\t{%d, arcs_%d_%d},\n", + s->s_narcs, i, j); + fprintf(fp, "};\n"); + } +} + +static void +printdfas(grammar *g, FILE *fp) +{ + dfa *d; + int i, j; + + printstates(g, fp); + fprintf(fp, "static dfa dfas[%d] = {\n", g->g_ndfas); + d = g->g_dfa; + for (i = 0; i < g->g_ndfas; i++, d++) { + fprintf(fp, "\t{%d, \"%s\", %d, %d, states_%d,\n", + d->d_type, d->d_name, d->d_initial, d->d_nstates, i); + fprintf(fp, "\t \""); + for (j = 0; j < NBYTES(g->g_ll.ll_nlabels); j++) + fprintf(fp, "\\%03o", d->d_first[j] & 0xff); + fprintf(fp, "\"},\n"); + } + fprintf(fp, "};\n"); +} + +static void +printlabels(grammar *g, FILE *fp) +{ + label *l; + int i; + + fprintf(fp, "static label labels[%d] = {\n", g->g_ll.ll_nlabels); + l = g->g_ll.ll_label; + for (i = g->g_ll.ll_nlabels; --i >= 0; l++) { + if (l->lb_str == NULL) + fprintf(fp, "\t{%d, 0},\n", l->lb_type); + else + fprintf(fp, "\t{%d, \"%s\"},\n", + l->lb_type, l->lb_str); + } + fprintf(fp, "};\n"); +} diff --git a/sys/src/cmd/python/Parser/spark.py b/sys/src/cmd/python/Parser/spark.py new file mode 100644 index 000000000..2c18623a0 --- /dev/null +++ b/sys/src/cmd/python/Parser/spark.py @@ -0,0 +1,840 @@ +# Copyright (c) 1998-2002 John Aycock +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, 
and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__version__ = 'SPARK-0.7 (pre-alpha-5)' + +import re +import sys +import string + +def _namelist(instance): + namelist, namedict, classlist = [], {}, [instance.__class__] + for c in classlist: + for b in c.__bases__: + classlist.append(b) + for name in c.__dict__.keys(): + if not namedict.has_key(name): + namelist.append(name) + namedict[name] = 1 + return namelist + +class GenericScanner: + def __init__(self, flags=0): + pattern = self.reflect() + self.re = re.compile(pattern, re.VERBOSE|flags) + + self.index2func = {} + for name, number in self.re.groupindex.items(): + self.index2func[number-1] = getattr(self, 't_' + name) + + def makeRE(self, name): + doc = getattr(self, name).__doc__ + rv = '(?P<%s>%s)' % (name[2:], doc) + return rv + + def reflect(self): + rv = [] + for name in _namelist(self): + if name[:2] == 't_' and name != 't_default': + rv.append(self.makeRE(name)) + + rv.append(self.makeRE('t_default')) + return string.join(rv, '|') + + def error(self, s, pos): + print "Lexical error at position %s" % pos + raise SystemExit + + def tokenize(self, s): + pos = 0 + n = len(s) + while pos < n: + m = self.re.match(s, pos) + if m is None: + self.error(s, pos) + + groups = m.groups() + for i in range(len(groups)): + if groups[i] and self.index2func.has_key(i): + self.index2func[i](groups[i]) + pos = m.end() + + def t_default(self, s): + r'( . | \n )+' + print "Specification error: unmatched input" + raise SystemExit + +# +# Extracted from GenericParser and made global so that [un]picking works. +# +class _State: + def __init__(self, stateno, items): + self.T, self.complete, self.items = [], [], items + self.stateno = stateno + +class GenericParser: + # + # An Earley parser, as per J. Earley, "An Efficient Context-Free + # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley, + # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis, + # Carnegie-Mellon University, August 1968. New formulation of + # the parser according to J. Aycock, "Practical Earley Parsing + # and the SPARK Toolkit", Ph.D. thesis, University of Victoria, + # 2001, and J. Aycock and R. N. Horspool, "Practical Earley + # Parsing", unpublished paper, 2001. + # + + def __init__(self, start): + self.rules = {} + self.rule2func = {} + self.rule2name = {} + self.collectRules() + self.augment(start) + self.ruleschanged = 1 + + _NULLABLE = '\e_' + _START = 'START' + _BOF = '|-' + + # + # When pickling, take the time to generate the full state machine; + # some information is then extraneous, too. Unfortunately we + # can't save the rule2func map. 
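+    # (rule2func maps grammar rules to bound methods and lambdas,
+    # which don't pickle; __setstate__ below therefore calls
+    # collectRules() on the fresh instance and grafts the rebuilt
+    # map back in.)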
+ # + def __getstate__(self): + if self.ruleschanged: + # + # XXX - duplicated from parse() + # + self.computeNull() + self.newrules = {} + self.new2old = {} + self.makeNewRules() + self.ruleschanged = 0 + self.edges, self.cores = {}, {} + self.states = { 0: self.makeState0() } + self.makeState(0, self._BOF) + # + # XXX - should find a better way to do this.. + # + changes = 1 + while changes: + changes = 0 + for k, v in self.edges.items(): + if v is None: + state, sym = k + if self.states.has_key(state): + self.goto(state, sym) + changes = 1 + rv = self.__dict__.copy() + for s in self.states.values(): + del s.items + del rv['rule2func'] + del rv['nullable'] + del rv['cores'] + return rv + + def __setstate__(self, D): + self.rules = {} + self.rule2func = {} + self.rule2name = {} + self.collectRules() + start = D['rules'][self._START][0][1][1] # Blech. + self.augment(start) + D['rule2func'] = self.rule2func + D['makeSet'] = self.makeSet_fast + self.__dict__ = D + + # + # A hook for GenericASTBuilder and GenericASTMatcher. Mess + # thee not with this; nor shall thee toucheth the _preprocess + # argument to addRule. + # + def preprocess(self, rule, func): return rule, func + + def addRule(self, doc, func, _preprocess=1): + fn = func + rules = string.split(doc) + + index = [] + for i in range(len(rules)): + if rules[i] == '::=': + index.append(i-1) + index.append(len(rules)) + + for i in range(len(index)-1): + lhs = rules[index[i]] + rhs = rules[index[i]+2:index[i+1]] + rule = (lhs, tuple(rhs)) + + if _preprocess: + rule, fn = self.preprocess(rule, func) + + if self.rules.has_key(lhs): + self.rules[lhs].append(rule) + else: + self.rules[lhs] = [ rule ] + self.rule2func[rule] = fn + self.rule2name[rule] = func.__name__[2:] + self.ruleschanged = 1 + + def collectRules(self): + for name in _namelist(self): + if name[:2] == 'p_': + func = getattr(self, name) + doc = func.__doc__ + self.addRule(doc, func) + + def augment(self, start): + rule = '%s ::= %s %s' % (self._START, self._BOF, start) + self.addRule(rule, lambda args: args[1], 0) + + def computeNull(self): + self.nullable = {} + tbd = [] + + for rulelist in self.rules.values(): + lhs = rulelist[0][0] + self.nullable[lhs] = 0 + for rule in rulelist: + rhs = rule[1] + if len(rhs) == 0: + self.nullable[lhs] = 1 + continue + # + # We only need to consider rules which + # consist entirely of nonterminal symbols. + # This should be a savings on typical + # grammars. + # + for sym in rhs: + if not self.rules.has_key(sym): + break + else: + tbd.append(rule) + changes = 1 + while changes: + changes = 0 + for lhs, rhs in tbd: + if self.nullable[lhs]: + continue + for sym in rhs: + if not self.nullable[sym]: + break + else: + self.nullable[lhs] = 1 + changes = 1 + + def makeState0(self): + s0 = _State(0, []) + for rule in self.newrules[self._START]: + s0.items.append((rule, 0)) + return s0 + + def finalState(self, tokens): + # + # Yuck. 
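+        # (makeNewRules() adds a second, nullable variant of the START
+        # rule when the grammar's start symbol can derive the empty
+        # string; two rules plus an empty token stream therefore seems
+        # to mean "accept at state 1 immediately".)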
+ # + if len(self.newrules[self._START]) == 2 and len(tokens) == 0: + return 1 + start = self.rules[self._START][0][1][1] + return self.goto(1, start) + + def makeNewRules(self): + worklist = [] + for rulelist in self.rules.values(): + for rule in rulelist: + worklist.append((rule, 0, 1, rule)) + + for rule, i, candidate, oldrule in worklist: + lhs, rhs = rule + n = len(rhs) + while i < n: + sym = rhs[i] + if not self.rules.has_key(sym) or \ + not self.nullable[sym]: + candidate = 0 + i = i + 1 + continue + + newrhs = list(rhs) + newrhs[i] = self._NULLABLE+sym + newrule = (lhs, tuple(newrhs)) + worklist.append((newrule, i+1, + candidate, oldrule)) + candidate = 0 + i = i + 1 + else: + if candidate: + lhs = self._NULLABLE+lhs + rule = (lhs, rhs) + if self.newrules.has_key(lhs): + self.newrules[lhs].append(rule) + else: + self.newrules[lhs] = [ rule ] + self.new2old[rule] = oldrule + + def typestring(self, token): + return None + + def error(self, token): + print "Syntax error at or near `%s' token" % token + raise SystemExit + + def parse(self, tokens): + sets = [ [(1,0), (2,0)] ] + self.links = {} + + if self.ruleschanged: + self.computeNull() + self.newrules = {} + self.new2old = {} + self.makeNewRules() + self.ruleschanged = 0 + self.edges, self.cores = {}, {} + self.states = { 0: self.makeState0() } + self.makeState(0, self._BOF) + + for i in xrange(len(tokens)): + sets.append([]) + + if sets[i] == []: + break + self.makeSet(tokens[i], sets, i) + else: + sets.append([]) + self.makeSet(None, sets, len(tokens)) + + #_dump(tokens, sets, self.states) + + finalitem = (self.finalState(tokens), 0) + if finalitem not in sets[-2]: + if len(tokens) > 0: + self.error(tokens[i-1]) + else: + self.error(None) + + return self.buildTree(self._START, finalitem, + tokens, len(sets)-2) + + def isnullable(self, sym): + # + # For symbols in G_e only. If we weren't supporting 1.5, + # could just use sym.startswith(). + # + return self._NULLABLE == sym[0:len(self._NULLABLE)] + + def skip(self, (lhs, rhs), pos=0): + n = len(rhs) + while pos < n: + if not self.isnullable(rhs[pos]): + break + pos = pos + 1 + return pos + + def makeState(self, state, sym): + assert sym is not None + # + # Compute \epsilon-kernel state's core and see if + # it exists already. + # + kitems = [] + for rule, pos in self.states[state].items: + lhs, rhs = rule + if rhs[pos:pos+1] == (sym,): + kitems.append((rule, self.skip(rule, pos+1))) + core = kitems + + core.sort() + tcore = tuple(core) + if self.cores.has_key(tcore): + return self.cores[tcore] + # + # Nope, doesn't exist. Compute it and the associated + # \epsilon-nonkernel state together; we'll need it right away. + # + k = self.cores[tcore] = len(self.states) + K, NK = _State(k, kitems), _State(k+1, []) + self.states[k] = K + predicted = {} + + edges = self.edges + rules = self.newrules + for X in K, NK: + worklist = X.items + for item in worklist: + rule, pos = item + lhs, rhs = rule + if pos == len(rhs): + X.complete.append(rule) + continue + + nextSym = rhs[pos] + key = (X.stateno, nextSym) + if not rules.has_key(nextSym): + if not edges.has_key(key): + edges[key] = None + X.T.append(nextSym) + else: + edges[key] = None + if not predicted.has_key(nextSym): + predicted[nextSym] = 1 + for prule in rules[nextSym]: + ppos = self.skip(prule) + new = (prule, ppos) + NK.items.append(new) + # + # Problem: we know K needs generating, but we + # don't yet know about NK. Can't commit anything + # regarding NK to self.edges until we're sure. 
Should + # we delay committing on both K and NK to avoid this + # hacky code? This creates other problems.. + # + if X is K: + edges = {} + + if NK.items == []: + return k + + # + # Check for \epsilon-nonkernel's core. Unfortunately we + # need to know the entire set of predicted nonterminals + # to do this without accidentally duplicating states. + # + core = predicted.keys() + core.sort() + tcore = tuple(core) + if self.cores.has_key(tcore): + self.edges[(k, None)] = self.cores[tcore] + return k + + nk = self.cores[tcore] = self.edges[(k, None)] = NK.stateno + self.edges.update(edges) + self.states[nk] = NK + return k + + def goto(self, state, sym): + key = (state, sym) + if not self.edges.has_key(key): + # + # No transitions from state on sym. + # + return None + + rv = self.edges[key] + if rv is None: + # + # Target state isn't generated yet. Remedy this. + # + rv = self.makeState(state, sym) + self.edges[key] = rv + return rv + + def gotoT(self, state, t): + return [self.goto(state, t)] + + def gotoST(self, state, st): + rv = [] + for t in self.states[state].T: + if st == t: + rv.append(self.goto(state, t)) + return rv + + def add(self, set, item, i=None, predecessor=None, causal=None): + if predecessor is None: + if item not in set: + set.append(item) + else: + key = (item, i) + if item not in set: + self.links[key] = [] + set.append(item) + self.links[key].append((predecessor, causal)) + + def makeSet(self, token, sets, i): + cur, next = sets[i], sets[i+1] + + ttype = token is not None and self.typestring(token) or None + if ttype is not None: + fn, arg = self.gotoT, ttype + else: + fn, arg = self.gotoST, token + + for item in cur: + ptr = (item, i) + state, parent = item + add = fn(state, arg) + for k in add: + if k is not None: + self.add(next, (k, parent), i+1, ptr) + nk = self.goto(k, None) + if nk is not None: + self.add(next, (nk, i+1)) + + if parent == i: + continue + + for rule in self.states[state].complete: + lhs, rhs = rule + for pitem in sets[parent]: + pstate, pparent = pitem + k = self.goto(pstate, lhs) + if k is not None: + why = (item, i, rule) + pptr = (pitem, parent) + self.add(cur, (k, pparent), + i, pptr, why) + nk = self.goto(k, None) + if nk is not None: + self.add(cur, (nk, i)) + + def makeSet_fast(self, token, sets, i): + # + # Call *only* when the entire state machine has been built! + # It relies on self.edges being filled in completely, and + # then duplicates and inlines code to boost speed at the + # cost of extreme ugliness. 
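+ # __setstate__() installs this method as the instance's makeSet,
+ # so an unpickled parser (whose tables are already complete)
+ # takes this fast path automatically whenever parse() calls
+ # self.makeSet.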
+ # + cur, next = sets[i], sets[i+1] + ttype = token is not None and self.typestring(token) or None + + for item in cur: + ptr = (item, i) + state, parent = item + if ttype is not None: + k = self.edges.get((state, ttype), None) + if k is not None: + #self.add(next, (k, parent), i+1, ptr) + #INLINED --v + new = (k, parent) + key = (new, i+1) + if new not in next: + self.links[key] = [] + next.append(new) + self.links[key].append((ptr, None)) + #INLINED --^ + #nk = self.goto(k, None) + nk = self.edges.get((k, None), None) + if nk is not None: + #self.add(next, (nk, i+1)) + #INLINED --v + new = (nk, i+1) + if new not in next: + next.append(new) + #INLINED --^ + else: + add = self.gotoST(state, token) + for k in add: + if k is not None: + self.add(next, (k, parent), i+1, ptr) + #nk = self.goto(k, None) + nk = self.edges.get((k, None), None) + if nk is not None: + self.add(next, (nk, i+1)) + + if parent == i: + continue + + for rule in self.states[state].complete: + lhs, rhs = rule + for pitem in sets[parent]: + pstate, pparent = pitem + #k = self.goto(pstate, lhs) + k = self.edges.get((pstate, lhs), None) + if k is not None: + why = (item, i, rule) + pptr = (pitem, parent) + #self.add(cur, (k, pparent), + # i, pptr, why) + #INLINED --v + new = (k, pparent) + key = (new, i) + if new not in cur: + self.links[key] = [] + cur.append(new) + self.links[key].append((pptr, why)) + #INLINED --^ + #nk = self.goto(k, None) + nk = self.edges.get((k, None), None) + if nk is not None: + #self.add(cur, (nk, i)) + #INLINED --v + new = (nk, i) + if new not in cur: + cur.append(new) + #INLINED --^ + + def predecessor(self, key, causal): + for p, c in self.links[key]: + if c == causal: + return p + assert 0 + + def causal(self, key): + links = self.links[key] + if len(links) == 1: + return links[0][1] + choices = [] + rule2cause = {} + for p, c in links: + rule = c[2] + choices.append(rule) + rule2cause[rule] = c + return rule2cause[self.ambiguity(choices)] + + def deriveEpsilon(self, nt): + if len(self.newrules[nt]) > 1: + rule = self.ambiguity(self.newrules[nt]) + else: + rule = self.newrules[nt][0] + #print rule + + rhs = rule[1] + attr = [None] * len(rhs) + + for i in range(len(rhs)-1, -1, -1): + attr[i] = self.deriveEpsilon(rhs[i]) + return self.rule2func[self.new2old[rule]](attr) + + def buildTree(self, nt, item, tokens, k): + state, parent = item + + choices = [] + for rule in self.states[state].complete: + if rule[0] == nt: + choices.append(rule) + rule = choices[0] + if len(choices) > 1: + rule = self.ambiguity(choices) + #print rule + + rhs = rule[1] + attr = [None] * len(rhs) + + for i in range(len(rhs)-1, -1, -1): + sym = rhs[i] + if not self.newrules.has_key(sym): + if sym != self._BOF: + attr[i] = tokens[k-1] + key = (item, k) + item, k = self.predecessor(key, None) + #elif self.isnullable(sym): + elif self._NULLABLE == sym[0:len(self._NULLABLE)]: + attr[i] = self.deriveEpsilon(sym) + else: + key = (item, k) + why = self.causal(key) + attr[i] = self.buildTree(sym, why[0], + tokens, why[1]) + item, k = self.predecessor(key, why) + return self.rule2func[self.new2old[rule]](attr) + + def ambiguity(self, rules): + # + # XXX - problem here and in collectRules() if the same rule + # appears in >1 method. Also undefined results if rules + # causing the ambiguity appear in the same method. 
+ # + sortlist = [] + name2index = {} + for i in range(len(rules)): + lhs, rhs = rule = rules[i] + name = self.rule2name[self.new2old[rule]] + sortlist.append((len(rhs), name)) + name2index[name] = i + sortlist.sort() + list = map(lambda (a,b): b, sortlist) + return rules[name2index[self.resolve(list)]] + + def resolve(self, list): + # + # Resolve ambiguity in favor of the shortest RHS. + # Since we walk the tree from the top down, this + # should effectively resolve in favor of a "shift". + # + return list[0] + +# +# GenericASTBuilder automagically constructs a concrete/abstract syntax tree +# for a given input. The extra argument is a class (not an instance!) +# which supports the "__setslice__" and "__len__" methods. +# +# XXX - silently overrides any user code in methods. +# + +class GenericASTBuilder(GenericParser): + def __init__(self, AST, start): + GenericParser.__init__(self, start) + self.AST = AST + + def preprocess(self, rule, func): + rebind = lambda lhs, self=self: \ + lambda args, lhs=lhs, self=self: \ + self.buildASTNode(args, lhs) + lhs, rhs = rule + return rule, rebind(lhs) + + def buildASTNode(self, args, lhs): + children = [] + for arg in args: + if isinstance(arg, self.AST): + children.append(arg) + else: + children.append(self.terminal(arg)) + return self.nonterminal(lhs, children) + + def terminal(self, token): return token + + def nonterminal(self, type, args): + rv = self.AST(type) + rv[:len(args)] = args + return rv + +# +# GenericASTTraversal is a Visitor pattern according to Design Patterns. For +# each node it attempts to invoke the method n_<node type>, falling +# back onto the default() method if the n_* can't be found. The preorder +# traversal also looks for an exit hook named n_<node type>_exit (no default +# routine is called if it's not found). To prematurely halt traversal +# of a subtree, call the prune() method -- this only makes sense for a +# preorder traversal. Node type is determined via the typestring() method. +# + +class GenericASTTraversalPruningException: + pass + +class GenericASTTraversal: + def __init__(self, ast): + self.ast = ast + + def typestring(self, node): + return node.type + + def prune(self): + raise GenericASTTraversalPruningException + + def preorder(self, node=None): + if node is None: + node = self.ast + + try: + name = 'n_' + self.typestring(node) + if hasattr(self, name): + func = getattr(self, name) + func(node) + else: + self.default(node) + except GenericASTTraversalPruningException: + return + + for kid in node: + self.preorder(kid) + + name = name + '_exit' + if hasattr(self, name): + func = getattr(self, name) + func(node) + + def postorder(self, node=None): + if node is None: + node = self.ast + + for kid in node: + self.postorder(kid) + + name = 'n_' + self.typestring(node) + if hasattr(self, name): + func = getattr(self, name) + func(node) + else: + self.default(node) + + + def default(self, node): + pass + +# +# GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__" +# implemented. +# +# XXX - makes assumptions about how GenericParser walks the parse tree. 
+# + +class GenericASTMatcher(GenericParser): + def __init__(self, start, ast): + GenericParser.__init__(self, start) + self.ast = ast + + def preprocess(self, rule, func): + rebind = lambda func, self=self: \ + lambda args, func=func, self=self: \ + self.foundMatch(args, func) + lhs, rhs = rule + rhslist = list(rhs) + rhslist.reverse() + + return (lhs, tuple(rhslist)), rebind(func) + + def foundMatch(self, args, func): + func(args[-1]) + return args[-1] + + def match_r(self, node): + self.input.insert(0, node) + children = 0 + + for child in node: + if children == 0: + self.input.insert(0, '(') + children = children + 1 + self.match_r(child) + + if children > 0: + self.input.insert(0, ')') + + def match(self, ast=None): + if ast is None: + ast = self.ast + self.input = [] + + self.match_r(ast) + self.parse(self.input) + + def resolve(self, list): + # + # Resolve ambiguity in favor of the longest RHS. + # + return list[-1] + +def _dump(tokens, sets, states): + for i in range(len(sets)): + print 'set', i + for item in sets[i]: + print '\t', item + for (lhs, rhs), pos in states[item[0]].items: + print '\t\t', lhs, '::=', + print string.join(rhs[:pos]), + print '.', + print string.join(rhs[pos:]) + if i < len(tokens): + print + print 'token', str(tokens[i]) + print diff --git a/sys/src/cmd/python/Parser/tokenizer.c b/sys/src/cmd/python/Parser/tokenizer.c new file mode 100644 index 000000000..c58b6899b --- /dev/null +++ b/sys/src/cmd/python/Parser/tokenizer.c @@ -0,0 +1,1535 @@ + +/* Tokenizer implementation */ + +#include "Python.h" +#include "pgenheaders.h" + +#include <ctype.h> +#include <assert.h> + +#include "tokenizer.h" +#include "errcode.h" + +#ifndef PGEN +#include "unicodeobject.h" +#include "stringobject.h" +#include "fileobject.h" +#include "codecs.h" +#include "abstract.h" +#endif /* PGEN */ + +extern char *PyOS_Readline(FILE *, FILE *, char *); +/* Return malloc'ed string including trailing \n; + empty malloc'ed string for EOF; + NULL if interrupted */ + +/* Don't ever change this -- it would break the portability of Python code */ +#define TABSIZE 8 + +/* Convert a possibly signed character to a nonnegative int */ +/* XXX This assumes characters are 8 bits wide */ +#ifdef __CHAR_UNSIGNED__ +#define Py_CHARMASK(c) (c) +#else +#define Py_CHARMASK(c) ((c) & 0xff) +#endif + +/* Forward */ +static struct tok_state *tok_new(void); +static int tok_nextc(struct tok_state *tok); +static void tok_backup(struct tok_state *tok, int c); + +/* Token names */ + +char *_PyParser_TokenNames[] = { + "ENDMARKER", + "NAME", + "NUMBER", + "STRING", + "NEWLINE", + "INDENT", + "DEDENT", + "LPAR", + "RPAR", + "LSQB", + "RSQB", + "COLON", + "COMMA", + "SEMI", + "PLUS", + "MINUS", + "STAR", + "SLASH", + "VBAR", + "AMPER", + "LESS", + "GREATER", + "EQUAL", + "DOT", + "PERCENT", + "BACKQUOTE", + "LBRACE", + "RBRACE", + "EQEQUAL", + "NOTEQUAL", + "LESSEQUAL", + "GREATEREQUAL", + "TILDE", + "CIRCUMFLEX", + "LEFTSHIFT", + "RIGHTSHIFT", + "DOUBLESTAR", + "PLUSEQUAL", + "MINEQUAL", + "STAREQUAL", + "SLASHEQUAL", + "PERCENTEQUAL", + "AMPEREQUAL", + "VBAREQUAL", + "CIRCUMFLEXEQUAL", + "LEFTSHIFTEQUAL", + "RIGHTSHIFTEQUAL", + "DOUBLESTAREQUAL", + "DOUBLESLASH", + "DOUBLESLASHEQUAL", + "AT", + /* This table must match the #defines in token.h! 
*/ + "OP", + "<ERRORTOKEN>", + "<N_TOKENS>" +}; + + +/* Create and initialize a new tok_state structure */ + +static struct tok_state * +tok_new(void) +{ + struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( + sizeof(struct tok_state)); + if (tok == NULL) + return NULL; + tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; + tok->done = E_OK; + tok->fp = NULL; + tok->tabsize = TABSIZE; + tok->indent = 0; + tok->indstack[0] = 0; + tok->atbol = 1; + tok->pendin = 0; + tok->prompt = tok->nextprompt = NULL; + tok->lineno = 0; + tok->level = 0; + tok->filename = NULL; + tok->altwarning = 0; + tok->alterror = 0; + tok->alttabsize = 1; + tok->altindstack[0] = 0; + tok->decoding_state = 0; + tok->decoding_erred = 0; + tok->read_coding_spec = 0; + tok->encoding = NULL; + tok->cont_line = 0; +#ifndef PGEN + tok->decoding_readline = NULL; + tok->decoding_buffer = NULL; +#endif + return tok; +} + +#ifdef PGEN + +static char * +decoding_fgets(char *s, int size, struct tok_state *tok) +{ + return fgets(s, size, tok->fp); +} + +static int +decoding_feof(struct tok_state *tok) +{ + return feof(tok->fp); +} + +static const char * +decode_str(const char *str, struct tok_state *tok) +{ + return str; +} + +#else /* PGEN */ + +static char * +error_ret(struct tok_state *tok) /* XXX */ +{ + tok->decoding_erred = 1; + if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ + PyMem_FREE(tok->buf); + tok->buf = NULL; + return NULL; /* as if it were EOF */ +} + +static char * +new_string(const char *s, Py_ssize_t len) +{ + char* result = (char *)PyMem_MALLOC(len + 1); + if (result != NULL) { + memcpy(result, s, len); + result[len] = '\0'; + } + return result; +} + +static char * +get_normal_name(char *s) /* for utf-8 and latin-1 */ +{ + char buf[13]; + int i; + for (i = 0; i < 12; i++) { + int c = s[i]; + if (c == '\0') break; + else if (c == '_') buf[i] = '-'; + else buf[i] = tolower(c); + } + buf[i] = '\0'; + if (strcmp(buf, "utf-8") == 0 || + strncmp(buf, "utf-8-", 6) == 0) return "utf-8"; + else if (strcmp(buf, "latin-1") == 0 || + strcmp(buf, "iso-8859-1") == 0 || + strcmp(buf, "iso-latin-1") == 0 || + strncmp(buf, "latin-1-", 8) == 0 || + strncmp(buf, "iso-8859-1-", 11) == 0 || + strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1"; + else return s; +} + +/* Return the coding spec in S, or NULL if none is found. */ + +static char * +get_coding_spec(const char *s, Py_ssize_t size) +{ + Py_ssize_t i; + /* Coding spec must be in a comment, and that comment must be + * the only statement on the source code line. */ + for (i = 0; i < size - 6; i++) { + if (s[i] == '#') + break; + if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') + return NULL; + } + for (; i < size - 6; i++) { /* XXX inefficient search */ + const char* t = s + i; + if (strncmp(t, "coding", 6) == 0) { + const char* begin = NULL; + t += 6; + if (t[0] != ':' && t[0] != '=') + continue; + do { + t++; + } while (t[0] == '\x20' || t[0] == '\t'); + + begin = t; + while (isalnum(Py_CHARMASK(t[0])) || + t[0] == '-' || t[0] == '_' || t[0] == '.') + t++; + + if (begin < t) { + char* r = new_string(begin, t - begin); + char* q = get_normal_name(r); + if (r != q) { + PyMem_FREE(r); + r = new_string(q, strlen(q)); + } + return r; + } + } + } + return NULL; +} + +/* Check whether the line contains a coding spec. If it does, + invoke the set_readline function for the new encoding. + This function receives the tok_state and the new encoding. + Return 1 on success, 0 on failure. 
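+
+   Typical declarations recognized here (see PEP 263) are
+	# -*- coding: utf-8 -*-
+	# vim: set fileencoding=latin-1 :
+   i.e. a comment whose text matches "coding[:=]\s*([-\w.]+)".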
*/ + +static int +check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, + int set_readline(struct tok_state *, const char *)) +{ + char * cs; + int r = 1; + + if (tok->cont_line) + /* It's a continuation line, so it can't be a coding spec. */ + return 1; + cs = get_coding_spec(line, size); + if (cs != NULL) { + tok->read_coding_spec = 1; + if (tok->encoding == NULL) { + assert(tok->decoding_state == 1); /* raw */ + if (strcmp(cs, "utf-8") == 0 || + strcmp(cs, "iso-8859-1") == 0) { + tok->encoding = cs; + } else { +#ifdef Py_USING_UNICODE + r = set_readline(tok, cs); + if (r) { + tok->encoding = cs; + tok->decoding_state = -1; + } + else + PyMem_FREE(cs); +#else + /* Without Unicode support, we cannot + process the coding spec. Since there + won't be any Unicode literals, that + won't matter. */ + PyMem_FREE(cs); +#endif + } + } else { /* then, compare cs with BOM */ + r = (strcmp(tok->encoding, cs) == 0); + PyMem_FREE(cs); + } + } + if (!r) { + cs = tok->encoding; + if (!cs) + cs = "with BOM"; + PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs); + } + return r; +} + +/* See whether the file starts with a BOM. If it does, + invoke the set_readline function with the new encoding. + Return 1 on success, 0 on failure. */ + +static int +check_bom(int get_char(struct tok_state *), + void unget_char(int, struct tok_state *), + int set_readline(struct tok_state *, const char *), + struct tok_state *tok) +{ + int ch = get_char(tok); + tok->decoding_state = 1; + if (ch == EOF) { + return 1; + } else if (ch == 0xEF) { + ch = get_char(tok); if (ch != 0xBB) goto NON_BOM; + ch = get_char(tok); if (ch != 0xBF) goto NON_BOM; +#if 0 + /* Disable support for UTF-16 BOMs until a decision + is made whether this needs to be supported. */ + } else if (ch == 0xFE) { + ch = get_char(tok); if (ch != 0xFF) goto NON_BOM; + if (!set_readline(tok, "utf-16-be")) return 0; + tok->decoding_state = -1; + } else if (ch == 0xFF) { + ch = get_char(tok); if (ch != 0xFE) goto NON_BOM; + if (!set_readline(tok, "utf-16-le")) return 0; + tok->decoding_state = -1; +#endif + } else { + unget_char(ch, tok); + return 1; + } + if (tok->encoding != NULL) + PyMem_FREE(tok->encoding); + tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ + return 1; + NON_BOM: + /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */ + unget_char(0xFF, tok); /* XXX this will cause a syntax error */ + return 1; +} + +/* Read a line of text from TOK into S, using the stream in TOK. + Return NULL on failure, else S. + + On entry, tok->decoding_buffer will be one of: + 1) NULL: need to call tok->decoding_readline to get a new line + 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and + stored the result in tok->decoding_buffer + 3) PyStringObject *: previous call to fp_readl did not have enough room + (in the s buffer) to copy entire contents of the line read + by tok->decoding_readline. tok->decoding_buffer has the overflow. + In this case, fp_readl is called in a loop (with an expanded buffer) + until the buffer ends with a '\n' (or until the end of the file is + reached): see tok_nextc and its calls to decoding_fgets. +*/ + +static char * +fp_readl(char *s, int size, struct tok_state *tok) +{ +#ifndef Py_USING_UNICODE + /* In a non-Unicode built, this should never be called. 
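+   decoding_fgets() dispatches here only after a codec has been
+   installed through set_readline, which can only happen when
+   Py_USING_UNICODE is defined.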
*/ + Py_FatalError("fp_readl should not be called in this build."); + return NULL; /* Keep compiler happy (not reachable) */ +#else + PyObject* utf8 = NULL; + PyObject* buf = tok->decoding_buffer; + char *str; + Py_ssize_t utf8len; + + /* Ask for one less byte so we can terminate it */ + assert(size > 0); + size--; + + if (buf == NULL) { + buf = PyObject_CallObject(tok->decoding_readline, NULL); + if (buf == NULL) + return error_ret(tok); + } else { + tok->decoding_buffer = NULL; + if (PyString_CheckExact(buf)) + utf8 = buf; + } + if (utf8 == NULL) { + utf8 = PyUnicode_AsUTF8String(buf); + Py_DECREF(buf); + if (utf8 == NULL) + return error_ret(tok); + } + str = PyString_AsString(utf8); + utf8len = PyString_GET_SIZE(utf8); + if (utf8len > size) { + tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size); + if (tok->decoding_buffer == NULL) { + Py_DECREF(utf8); + return error_ret(tok); + } + utf8len = size; + } + memcpy(s, str, utf8len); + s[utf8len] = '\0'; + Py_DECREF(utf8); + if (utf8len == 0) return NULL; /* EOF */ + return s; +#endif +} + +/* Set the readline function for TOK to a StreamReader's + readline function. The StreamReader is named ENC. + + This function is called from check_bom and check_coding_spec. + + ENC is usually identical to the future value of tok->encoding, + except for the (currently unsupported) case of UTF-16. + + Return 1 on success, 0 on failure. */ + +static int +fp_setreadl(struct tok_state *tok, const char* enc) +{ + PyObject *reader, *stream, *readline; + + /* XXX: constify filename argument. */ + stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL); + if (stream == NULL) + return 0; + + reader = PyCodec_StreamReader(enc, stream, NULL); + Py_DECREF(stream); + if (reader == NULL) + return 0; + + readline = PyObject_GetAttrString(reader, "readline"); + Py_DECREF(reader); + if (readline == NULL) + return 0; + + tok->decoding_readline = readline; + return 1; +} + +/* Fetch the next byte from TOK. */ + +static int fp_getc(struct tok_state *tok) { + return getc(tok->fp); +} + +/* Unfetch the last byte back into TOK. */ + +static void fp_ungetc(int c, struct tok_state *tok) { + ungetc(c, tok->fp); +} + +/* Read a line of input from TOK. Determine encoding + if necessary. */ + +static char * +decoding_fgets(char *s, int size, struct tok_state *tok) +{ + char *line = NULL; + int badchar = 0; + for (;;) { + if (tok->decoding_state < 0) { + /* We already have a codec associated with + this input. */ + line = fp_readl(s, size, tok); + break; + } else if (tok->decoding_state > 0) { + /* We want a 'raw' read. */ + line = Py_UniversalNewlineFgets(s, size, + tok->fp, NULL); + break; + } else { + /* We have not yet determined the encoding. + If an encoding is found, use the file-pointer + reader functions from now on. */ + if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) + return error_ret(tok); + assert(tok->decoding_state != 0); + } + } + if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { + if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { + return error_ret(tok); + } + } +#ifndef PGEN + /* The default encoding is ASCII, so make sure we don't have any + non-ASCII bytes in it. */ + if (line && !tok->encoding) { + unsigned char *c; + for (c = (unsigned char *)line; *c; c++) + if (*c > 127) { + badchar = *c; + break; + } + } + if (badchar) { + char buf[500]; + /* Need to add 1 to the line number, since this line + has not been counted, yet. 
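+   (tok_nextc() increments tok->lineno only after this call
+   returns.)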
*/ + sprintf(buf, + "Non-ASCII character '\\x%.2x' " + "in file %.200s on line %i, " + "but no encoding declared; " + "see http://www.python.org/peps/pep-0263.html for details", + badchar, tok->filename, tok->lineno + 1); + PyErr_SetString(PyExc_SyntaxError, buf); + return error_ret(tok); + } +#endif + return line; +} + +static int +decoding_feof(struct tok_state *tok) +{ + if (tok->decoding_state >= 0) { + return feof(tok->fp); + } else { + PyObject* buf = tok->decoding_buffer; + if (buf == NULL) { + buf = PyObject_CallObject(tok->decoding_readline, NULL); + if (buf == NULL) { + error_ret(tok); + return 1; + } else { + tok->decoding_buffer = buf; + } + } + return PyObject_Length(buf) == 0; + } +} + +/* Fetch a byte from TOK, using the string buffer. */ + +static int +buf_getc(struct tok_state *tok) { + return Py_CHARMASK(*tok->str++); +} + +/* Unfetch a byte from TOK, using the string buffer. */ + +static void +buf_ungetc(int c, struct tok_state *tok) { + tok->str--; + assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ +} + +/* Set the readline function for TOK to ENC. For the string-based + tokenizer, this means to just record the encoding. */ + +static int +buf_setreadl(struct tok_state *tok, const char* enc) { + tok->enc = enc; + return 1; +} + +/* Return a UTF-8 encoding Python string object from the + C byte string STR, which is encoded with ENC. */ + +#ifdef Py_USING_UNICODE +static PyObject * +translate_into_utf8(const char* str, const char* enc) { + PyObject *utf8; + PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); + if (buf == NULL) + return NULL; + utf8 = PyUnicode_AsUTF8String(buf); + Py_DECREF(buf); + return utf8; +} +#endif + +/* Decode a byte string STR for use as the buffer of TOK. + Look for encoding declarations inside STR, and record them + inside TOK. */ + +static const char * +decode_str(const char *str, struct tok_state *tok) +{ + PyObject* utf8 = NULL; + const char *s; + int lineno = 0; + tok->enc = NULL; + tok->str = str; + if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) + return error_ret(tok); + str = tok->str; /* string after BOM if any */ + assert(str); +#ifdef Py_USING_UNICODE + if (tok->enc != NULL) { + utf8 = translate_into_utf8(str, tok->enc); + if (utf8 == NULL) + return error_ret(tok); + str = PyString_AsString(utf8); + } +#endif + for (s = str;; s++) { + if (*s == '\0') break; + else if (*s == '\n') { + lineno++; + if (lineno == 2) break; + } + } + tok->enc = NULL; + if (!check_coding_spec(str, s - str, tok, buf_setreadl)) + return error_ret(tok); +#ifdef Py_USING_UNICODE + if (tok->enc != NULL) { + assert(utf8 == NULL); + utf8 = translate_into_utf8(str, tok->enc); + if (utf8 == NULL) { + PyErr_Format(PyExc_SyntaxError, + "unknown encoding: %s", tok->enc); + return error_ret(tok); + } + str = PyString_AsString(utf8); + } +#endif + assert(tok->decoding_buffer == NULL); + tok->decoding_buffer = utf8; /* CAUTION */ + return str; +} + +#endif /* PGEN */ + +/* Set up tokenizer for string */ + +struct tok_state * +PyTokenizer_FromString(const char *str) +{ + struct tok_state *tok = tok_new(); + if (tok == NULL) + return NULL; + str = (char *)decode_str(str, tok); + if (str == NULL) { + PyTokenizer_Free(tok); + return NULL; + } + + /* XXX: constify members. 
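+   decode_str() may return either the caller's string or a buffer
+   owned by tok->decoding_buffer, hence the cast below.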
*/ + tok->buf = tok->cur = tok->end = tok->inp = (char*)str; + return tok; +} + + +/* Set up tokenizer for file */ + +struct tok_state * +PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) +{ + struct tok_state *tok = tok_new(); + if (tok == NULL) + return NULL; + if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) { + PyTokenizer_Free(tok); + return NULL; + } + tok->cur = tok->inp = tok->buf; + tok->end = tok->buf + BUFSIZ; + tok->fp = fp; + tok->prompt = ps1; + tok->nextprompt = ps2; + return tok; +} + + +/* Free a tok_state structure */ + +void +PyTokenizer_Free(struct tok_state *tok) +{ + if (tok->encoding != NULL) + PyMem_FREE(tok->encoding); +#ifndef PGEN + Py_XDECREF(tok->decoding_readline); + Py_XDECREF(tok->decoding_buffer); +#endif + if (tok->fp != NULL && tok->buf != NULL) + PyMem_FREE(tok->buf); + PyMem_FREE(tok); +} + +#if !defined(PGEN) && defined(Py_USING_UNICODE) +static int +tok_stdin_decode(struct tok_state *tok, char **inp) +{ + PyObject *enc, *sysstdin, *decoded, *utf8; + const char *encoding; + char *converted; + + if (PySys_GetFile((char *)"stdin", NULL) != stdin) + return 0; + sysstdin = PySys_GetObject("stdin"); + if (sysstdin == NULL || !PyFile_Check(sysstdin)) + return 0; + + enc = ((PyFileObject *)sysstdin)->f_encoding; + if (enc == NULL || !PyString_Check(enc)) + return 0; + Py_INCREF(enc); + + encoding = PyString_AsString(enc); + decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); + if (decoded == NULL) + goto error_clear; + + utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL); + Py_DECREF(decoded); + if (utf8 == NULL) + goto error_clear; + + assert(PyString_Check(utf8)); + converted = new_string(PyString_AS_STRING(utf8), + PyString_GET_SIZE(utf8)); + Py_DECREF(utf8); + if (converted == NULL) + goto error_nomem; + + PyMem_FREE(*inp); + *inp = converted; + if (tok->encoding != NULL) + PyMem_FREE(tok->encoding); + tok->encoding = new_string(encoding, strlen(encoding)); + if (tok->encoding == NULL) + goto error_nomem; + + Py_DECREF(enc); + return 0; + +error_nomem: + Py_DECREF(enc); + tok->done = E_NOMEM; + return -1; + +error_clear: + /* Fallback to iso-8859-1: for backward compatibility */ + Py_DECREF(enc); + PyErr_Clear(); + return 0; +} +#endif + +/* Get next char, updating state; error code goes into tok->done */ + +static int +tok_nextc(register struct tok_state *tok) +{ + for (;;) { + if (tok->cur != tok->inp) { + return Py_CHARMASK(*tok->cur++); /* Fast path */ + } + if (tok->done != E_OK) + return EOF; + if (tok->fp == NULL) { + char *end = strchr(tok->inp, '\n'); + if (end != NULL) + end++; + else { + end = strchr(tok->inp, '\0'); + if (end == tok->inp) { + tok->done = E_EOF; + return EOF; + } + } + if (tok->start == NULL) + tok->buf = tok->cur; + tok->line_start = tok->cur; + tok->lineno++; + tok->inp = end; + return Py_CHARMASK(*tok->cur++); + } + if (tok->prompt != NULL) { + char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); + if (tok->nextprompt != NULL) + tok->prompt = tok->nextprompt; + if (newtok == NULL) + tok->done = E_INTR; + else if (*newtok == '\0') { + PyMem_FREE(newtok); + tok->done = E_EOF; + } +#if !defined(PGEN) && defined(Py_USING_UNICODE) + else if (tok_stdin_decode(tok, &newtok) != 0) + PyMem_FREE(newtok); +#endif + else if (tok->start != NULL) { + size_t start = tok->start - tok->buf; + size_t oldlen = tok->cur - tok->buf; + size_t newlen = oldlen + strlen(newtok); + char *buf = tok->buf; + buf = (char *)PyMem_REALLOC(buf, newlen+1); + tok->lineno++; + if (buf == NULL) { + PyMem_FREE(tok->buf); + 
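+ /* Realloc failed: the old block was freed just above,
+    so clear the pointer to keep PyTokenizer_Free()
+    from freeing it a second time. */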
tok->buf = NULL; + PyMem_FREE(newtok); + tok->done = E_NOMEM; + return EOF; + } + tok->buf = buf; + tok->cur = tok->buf + oldlen; + tok->line_start = tok->cur; + strcpy(tok->buf + oldlen, newtok); + PyMem_FREE(newtok); + tok->inp = tok->buf + newlen; + tok->end = tok->inp + 1; + tok->start = tok->buf + start; + } + else { + tok->lineno++; + if (tok->buf != NULL) + PyMem_FREE(tok->buf); + tok->buf = newtok; + tok->line_start = tok->buf; + tok->cur = tok->buf; + tok->line_start = tok->buf; + tok->inp = strchr(tok->buf, '\0'); + tok->end = tok->inp + 1; + } + } + else { + int done = 0; + Py_ssize_t cur = 0; + char *pt; + if (tok->start == NULL) { + if (tok->buf == NULL) { + tok->buf = (char *) + PyMem_MALLOC(BUFSIZ); + if (tok->buf == NULL) { + tok->done = E_NOMEM; + return EOF; + } + tok->end = tok->buf + BUFSIZ; + } + if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), + tok) == NULL) { + tok->done = E_EOF; + done = 1; + } + else { + tok->done = E_OK; + tok->inp = strchr(tok->buf, '\0'); + done = tok->inp[-1] == '\n'; + } + } + else { + cur = tok->cur - tok->buf; + if (decoding_feof(tok)) { + tok->done = E_EOF; + done = 1; + } + else + tok->done = E_OK; + } + tok->lineno++; + /* Read until '\n' or EOF */ + while (!done) { + Py_ssize_t curstart = tok->start == NULL ? -1 : + tok->start - tok->buf; + Py_ssize_t curvalid = tok->inp - tok->buf; + Py_ssize_t newsize = curvalid + BUFSIZ; + char *newbuf = tok->buf; + newbuf = (char *)PyMem_REALLOC(newbuf, + newsize); + if (newbuf == NULL) { + tok->done = E_NOMEM; + tok->cur = tok->inp; + return EOF; + } + tok->buf = newbuf; + tok->inp = tok->buf + curvalid; + tok->end = tok->buf + newsize; + tok->start = curstart < 0 ? NULL : + tok->buf + curstart; + if (decoding_fgets(tok->inp, + (int)(tok->end - tok->inp), + tok) == NULL) { + /* Break out early on decoding + errors, as tok->buf will be NULL + */ + if (tok->decoding_erred) + return EOF; + /* Last line does not end in \n, + fake one */ + strcpy(tok->inp, "\n"); + } + tok->inp = strchr(tok->inp, '\0'); + done = tok->inp[-1] == '\n'; + } + if (tok->buf != NULL) { + tok->cur = tok->buf + cur; + tok->line_start = tok->cur; + /* replace "\r\n" with "\n" */ + /* For Mac leave the \r, giving syntax error */ + pt = tok->inp - 2; + if (pt >= tok->buf && *pt == '\r') { + *pt++ = '\n'; + *pt = '\0'; + tok->inp = pt; + } + } + } + if (tok->done != E_OK) { + if (tok->prompt != NULL) + PySys_WriteStderr("\n"); + tok->cur = tok->inp; + return EOF; + } + } + /*NOTREACHED*/ +} + + +/* Back-up one character */ + +static void +tok_backup(register struct tok_state *tok, register int c) +{ + if (c != EOF) { + if (--tok->cur < tok->buf) + Py_FatalError("tok_backup: begin of buffer"); + if (*tok->cur != c) + *tok->cur = c; + } +} + + +/* Return the token corresponding to a single character */ + +int +PyToken_OneChar(int c) +{ + switch (c) { + case '(': return LPAR; + case ')': return RPAR; + case '[': return LSQB; + case ']': return RSQB; + case ':': return COLON; + case ',': return COMMA; + case ';': return SEMI; + case '+': return PLUS; + case '-': return MINUS; + case '*': return STAR; + case '/': return SLASH; + case '|': return VBAR; + case '&': return AMPER; + case '<': return LESS; + case '>': return GREATER; + case '=': return EQUAL; + case '.': return DOT; + case '%': return PERCENT; + case '`': return BACKQUOTE; + case '{': return LBRACE; + case '}': return RBRACE; + case '^': return CIRCUMFLEX; + case '~': return TILDE; + case '@': return AT; + default: return OP; + } +} + + +int +PyToken_TwoChars(int c1, 
int c2) +{ + switch (c1) { + case '=': + switch (c2) { + case '=': return EQEQUAL; + } + break; + case '!': + switch (c2) { + case '=': return NOTEQUAL; + } + break; + case '<': + switch (c2) { + case '>': return NOTEQUAL; + case '=': return LESSEQUAL; + case '<': return LEFTSHIFT; + } + break; + case '>': + switch (c2) { + case '=': return GREATEREQUAL; + case '>': return RIGHTSHIFT; + } + break; + case '+': + switch (c2) { + case '=': return PLUSEQUAL; + } + break; + case '-': + switch (c2) { + case '=': return MINEQUAL; + } + break; + case '*': + switch (c2) { + case '*': return DOUBLESTAR; + case '=': return STAREQUAL; + } + break; + case '/': + switch (c2) { + case '/': return DOUBLESLASH; + case '=': return SLASHEQUAL; + } + break; + case '|': + switch (c2) { + case '=': return VBAREQUAL; + } + break; + case '%': + switch (c2) { + case '=': return PERCENTEQUAL; + } + break; + case '&': + switch (c2) { + case '=': return AMPEREQUAL; + } + break; + case '^': + switch (c2) { + case '=': return CIRCUMFLEXEQUAL; + } + break; + } + return OP; +} + +int +PyToken_ThreeChars(int c1, int c2, int c3) +{ + switch (c1) { + case '<': + switch (c2) { + case '<': + switch (c3) { + case '=': + return LEFTSHIFTEQUAL; + } + break; + } + break; + case '>': + switch (c2) { + case '>': + switch (c3) { + case '=': + return RIGHTSHIFTEQUAL; + } + break; + } + break; + case '*': + switch (c2) { + case '*': + switch (c3) { + case '=': + return DOUBLESTAREQUAL; + } + break; + } + break; + case '/': + switch (c2) { + case '/': + switch (c3) { + case '=': + return DOUBLESLASHEQUAL; + } + break; + } + break; + } + return OP; +} + +static int +indenterror(struct tok_state *tok) +{ + if (tok->alterror) { + tok->done = E_TABSPACE; + tok->cur = tok->inp; + return 1; + } + if (tok->altwarning) { + PySys_WriteStderr("%s: inconsistent use of tabs and spaces " + "in indentation\n", tok->filename); + tok->altwarning = 0; + } + return 0; +} + + +/* Get next token, after space stripping etc. */ + +static int +tok_get(register struct tok_state *tok, char **p_start, char **p_end) +{ + register int c; + int blankline; + + *p_start = *p_end = NULL; + nextline: + tok->start = NULL; + blankline = 0; + + /* Get indentation level */ + if (tok->atbol) { + register int col = 0; + register int altcol = 0; + tok->atbol = 0; + for (;;) { + c = tok_nextc(tok); + if (c == ' ') + col++, altcol++; + else if (c == '\t') { + col = (col/tok->tabsize + 1) * tok->tabsize; + altcol = (altcol/tok->alttabsize + 1) + * tok->alttabsize; + } + else if (c == '\014') /* Control-L (formfeed) */ + col = altcol = 0; /* For Emacs users */ + else + break; + } + tok_backup(tok, c); + if (c == '#' || c == '\n') { + /* Lines with only whitespace and/or comments + shouldn't affect the indentation and are + not passed to the parser as NEWLINE tokens, + except *totally* empty lines in interactive + mode, which signal the end of a command group. 
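+   (An empty line entered at the interactive continuation
+   prompt is what closes an open indented block.)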
*/ + if (col == 0 && c == '\n' && tok->prompt != NULL) + blankline = 0; /* Let it through */ + else + blankline = 1; /* Ignore completely */ + /* We can't jump back right here since we still + may need to skip to the end of a comment */ + } + if (!blankline && tok->level == 0) { + if (col == tok->indstack[tok->indent]) { + /* No change */ + if (altcol != tok->altindstack[tok->indent]) { + if (indenterror(tok)) + return ERRORTOKEN; + } + } + else if (col > tok->indstack[tok->indent]) { + /* Indent -- always one */ + if (tok->indent+1 >= MAXINDENT) { + tok->done = E_TOODEEP; + tok->cur = tok->inp; + return ERRORTOKEN; + } + if (altcol <= tok->altindstack[tok->indent]) { + if (indenterror(tok)) + return ERRORTOKEN; + } + tok->pendin++; + tok->indstack[++tok->indent] = col; + tok->altindstack[tok->indent] = altcol; + } + else /* col < tok->indstack[tok->indent] */ { + /* Dedent -- any number, must be consistent */ + while (tok->indent > 0 && + col < tok->indstack[tok->indent]) { + tok->pendin--; + tok->indent--; + } + if (col != tok->indstack[tok->indent]) { + tok->done = E_DEDENT; + tok->cur = tok->inp; + return ERRORTOKEN; + } + if (altcol != tok->altindstack[tok->indent]) { + if (indenterror(tok)) + return ERRORTOKEN; + } + } + } + } + + tok->start = tok->cur; + + /* Return pending indents/dedents */ + if (tok->pendin != 0) { + if (tok->pendin < 0) { + tok->pendin++; + return DEDENT; + } + else { + tok->pendin--; + return INDENT; + } + } + + again: + tok->start = NULL; + /* Skip spaces */ + do { + c = tok_nextc(tok); + } while (c == ' ' || c == '\t' || c == '\014'); + + /* Set start of current token */ + tok->start = tok->cur - 1; + + /* Skip comment, while looking for tab-setting magic */ + if (c == '#') { + static char *tabforms[] = { + "tab-width:", /* Emacs */ + ":tabstop=", /* vim, full form */ + ":ts=", /* vim, abbreviated form */ + "set tabsize=", /* will vi never die? */ + /* more templates can be added here to support other editors */ + }; + char cbuf[80]; + char *tp, **cp; + tp = cbuf; + do { + *tp++ = c = tok_nextc(tok); + } while (c != EOF && c != '\n' && + (size_t)(tp - cbuf + 1) < sizeof(cbuf)); + *tp = '\0'; + for (cp = tabforms; + cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]); + cp++) { + if ((tp = strstr(cbuf, *cp))) { + int newsize = atoi(tp + strlen(*cp)); + + if (newsize >= 1 && newsize <= 40) { + tok->tabsize = newsize; + if (Py_VerboseFlag) + PySys_WriteStderr( + "Tab size set to %d\n", + newsize); + } + } + } + while (c != EOF && c != '\n') + c = tok_nextc(tok); + } + + /* Check for EOF and errors now */ + if (c == EOF) { + return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; + } + + /* Identifier (most frequent token!) */ + if (isalpha(c) || c == '_') { + /* Process r"", u"" and ur"" */ + switch (c) { + case 'r': + case 'R': + c = tok_nextc(tok); + if (c == '"' || c == '\'') + goto letter_quote; + break; + case 'u': + case 'U': + c = tok_nextc(tok); + if (c == 'r' || c == 'R') + c = tok_nextc(tok); + if (c == '"' || c == '\'') + goto letter_quote; + break; + } + while (isalnum(c) || c == '_') { + c = tok_nextc(tok); + } + tok_backup(tok, c); + *p_start = tok->start; + *p_end = tok->cur; + return NAME; + } + + /* Newline */ + if (c == '\n') { + tok->atbol = 1; + if (blankline || tok->level > 0) + goto nextline; + *p_start = tok->start; + *p_end = tok->cur - 1; /* Leave '\n' out of the string */ + tok->cont_line = 0; + return NEWLINE; + } + + /* Period or number starting with period? 
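+   E.g. ".5" tokenizes as a single NUMBER, while the "." in
+   "x.y" comes back as a DOT token.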
*/ + if (c == '.') { + c = tok_nextc(tok); + if (isdigit(c)) { + goto fraction; + } + else { + tok_backup(tok, c); + *p_start = tok->start; + *p_end = tok->cur; + return DOT; + } + } + + /* Number */ + if (isdigit(c)) { + if (c == '0') { + /* Hex or octal -- maybe. */ + c = tok_nextc(tok); + if (c == '.') + goto fraction; +#ifndef WITHOUT_COMPLEX + if (c == 'j' || c == 'J') + goto imaginary; +#endif + if (c == 'x' || c == 'X') { + /* Hex */ + do { + c = tok_nextc(tok); + } while (isxdigit(c)); + } + else { + int found_decimal = 0; + /* Octal; c is first char of it */ + /* There's no 'isoctdigit' macro, sigh */ + while ('0' <= c && c < '8') { + c = tok_nextc(tok); + } + if (isdigit(c)) { + found_decimal = 1; + do { + c = tok_nextc(tok); + } while (isdigit(c)); + } + if (c == '.') + goto fraction; + else if (c == 'e' || c == 'E') + goto exponent; +#ifndef WITHOUT_COMPLEX + else if (c == 'j' || c == 'J') + goto imaginary; +#endif + else if (found_decimal) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } + } + if (c == 'l' || c == 'L') + c = tok_nextc(tok); + } + else { + /* Decimal */ + do { + c = tok_nextc(tok); + } while (isdigit(c)); + if (c == 'l' || c == 'L') + c = tok_nextc(tok); + else { + /* Accept floating point numbers. */ + if (c == '.') { + fraction: + /* Fraction */ + do { + c = tok_nextc(tok); + } while (isdigit(c)); + } + if (c == 'e' || c == 'E') { + exponent: + /* Exponent part */ + c = tok_nextc(tok); + if (c == '+' || c == '-') + c = tok_nextc(tok); + if (!isdigit(c)) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } + do { + c = tok_nextc(tok); + } while (isdigit(c)); + } +#ifndef WITHOUT_COMPLEX + if (c == 'j' || c == 'J') + /* Imaginary part */ + imaginary: + c = tok_nextc(tok); +#endif + } + } + tok_backup(tok, c); + *p_start = tok->start; + *p_end = tok->cur; + return NUMBER; + } + + letter_quote: + /* String */ + if (c == '\'' || c == '"') { + Py_ssize_t quote2 = tok->cur - tok->start + 1; + int quote = c; + int triple = 0; + int tripcount = 0; + for (;;) { + c = tok_nextc(tok); + if (c == '\n') { + if (!triple) { + tok->done = E_EOLS; + tok_backup(tok, c); + return ERRORTOKEN; + } + tripcount = 0; + tok->cont_line = 1; /* multiline string. 
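+   (cont_line also tells check_coding_spec() to skip
+   such lines.)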
*/ + } + else if (c == EOF) { + if (triple) + tok->done = E_EOFS; + else + tok->done = E_EOLS; + tok->cur = tok->inp; + return ERRORTOKEN; + } + else if (c == quote) { + tripcount++; + if (tok->cur - tok->start == quote2) { + c = tok_nextc(tok); + if (c == quote) { + triple = 1; + tripcount = 0; + continue; + } + tok_backup(tok, c); + } + if (!triple || tripcount == 3) + break; + } + else if (c == '\\') { + tripcount = 0; + c = tok_nextc(tok); + if (c == EOF) { + tok->done = E_EOLS; + tok->cur = tok->inp; + return ERRORTOKEN; + } + } + else + tripcount = 0; + } + *p_start = tok->start; + *p_end = tok->cur; + return STRING; + } + + /* Line continuation */ + if (c == '\\') { + c = tok_nextc(tok); + if (c != '\n') { + tok->done = E_LINECONT; + tok->cur = tok->inp; + return ERRORTOKEN; + } + tok->cont_line = 1; + goto again; /* Read next line */ + } + + /* Check for two-character token */ + { + int c2 = tok_nextc(tok); + int token = PyToken_TwoChars(c, c2); + if (token != OP) { + int c3 = tok_nextc(tok); + int token3 = PyToken_ThreeChars(c, c2, c3); + if (token3 != OP) { + token = token3; + } else { + tok_backup(tok, c3); + } + *p_start = tok->start; + *p_end = tok->cur; + return token; + } + tok_backup(tok, c2); + } + + /* Keep track of parentheses nesting level */ + switch (c) { + case '(': + case '[': + case '{': + tok->level++; + break; + case ')': + case ']': + case '}': + tok->level--; + break; + } + + /* Punctuation character */ + *p_start = tok->start; + *p_end = tok->cur; + return PyToken_OneChar(c); +} + +int +PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) +{ + int result = tok_get(tok, p_start, p_end); + if (tok->decoding_erred) { + result = ERRORTOKEN; + tok->done = E_DECODE; + } + return result; +} + +#ifdef Py_DEBUG + +void +tok_dump(int type, char *start, char *end) +{ + printf("%s", _PyParser_TokenNames[type]); + if (type == NAME || type == NUMBER || type == STRING || type == OP) + printf("(%.*s)", (int)(end - start), start); +} + +#endif diff --git a/sys/src/cmd/python/Parser/tokenizer.h b/sys/src/cmd/python/Parser/tokenizer.h new file mode 100644 index 000000000..5e7ebf74f --- /dev/null +++ b/sys/src/cmd/python/Parser/tokenizer.h @@ -0,0 +1,65 @@ +#ifndef Py_TOKENIZER_H +#define Py_TOKENIZER_H +#ifdef __cplusplus +extern "C" { +#endif + +#include "object.h" + +/* Tokenizer interface */ + +#include "token.h" /* For token types */ + +#define MAXINDENT 100 /* Max indentation level */ + +/* Tokenizer state */ +struct tok_state { + /* Input state; buf <= cur <= inp <= end */ + /* NB an entire line is held in the buffer */ + char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ + char *cur; /* Next character in buffer */ + char *inp; /* End of data in buffer */ + char *end; /* End of input buffer if buf != NULL */ + char *start; /* Start of current token if not NULL */ + int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ + /* NB If done != E_OK, cur must be == inp!!! 
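+   Most error paths keep this invariant by assigning
+   tok->cur = tok->inp before returning.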
*/
+ FILE *fp; /* Rest of input; NULL if tokenizing a string */
+ int tabsize; /* Tab spacing */
+ int indent; /* Current indentation index */
+ int indstack[MAXINDENT]; /* Stack of indents */
+ int atbol; /* Nonzero if at begin of new line */
+ int pendin; /* Pending indents (if > 0) or dedents (if < 0) */
+ char *prompt, *nextprompt; /* For interactive prompting */
+ int lineno; /* Current line number */
+ int level; /* () [] {} Parentheses nesting level */
+ /* Used to allow free continuations inside them */
+ /* Stuff for checking on different tab sizes */
+ const char *filename; /* For error messages */
+ int altwarning; /* Issue warning if alternate tabs don't match */
+ int alterror; /* Issue error if alternate tabs don't match */
+ int alttabsize; /* Alternate tab spacing */
+ int altindstack[MAXINDENT]; /* Stack of alternate indents */
+ /* Stuff for PEP 0263 */
+ int decoding_state; /* -1:decoding, 0:init, 1:raw */
+ int decoding_erred; /* whether erred in decoding */
+ int read_coding_spec; /* whether 'coding:...' has been read */
+ char *encoding;
+ int cont_line; /* whether we are in a continuation line. */
+ const char* line_start; /* pointer to start of current line */
+#ifndef PGEN
+ PyObject *decoding_readline; /* codecs.open(...).readline */
+ PyObject *decoding_buffer;
+#endif
+ const char* enc;
+ const char* str;
+};
+
+extern struct tok_state *PyTokenizer_FromString(const char *);
+extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *);
+extern void PyTokenizer_Free(struct tok_state *);
+extern int PyTokenizer_Get(struct tok_state *, char **, char **);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_TOKENIZER_H */ diff --git a/sys/src/cmd/python/Parser/tokenizer_pgen.c b/sys/src/cmd/python/Parser/tokenizer_pgen.c new file mode 100644 index 000000000..9cb8492d6 --- /dev/null +++ b/sys/src/cmd/python/Parser/tokenizer_pgen.c @@ -0,0 +1,2 @@ +#define PGEN +#include "tokenizer.c"