From 8c6632636807c35bee40210ed8483c1eca82664f Mon Sep 17 00:00:00 2001 From: Eric Smith Date: Sat, 25 Aug 2007 02:26:07 +0000 Subject: [PATCH] Implementation of PEP 3101, Advanced String Formatting. Known issues: The string.Formatter class, as discussed in the PEP, is incomplete. Error handling needs to conform to the PEP. Need to fix this warning that I introduced in Python/formatter_unicode.c: Objects/stringlib/unicodedefs.h:26: warning: `STRINGLIB_CMP' defined but not used Need to make sure sign formatting is correct, more tests needed. Need to remove '()' sign formatting, left over from an earlier version of the PEP. --- Include/formatter_unicode.h | 9 + Include/unicodeobject.h | 5 + Lib/string.py | 39 ++ Lib/test/test_builtin.py | 26 + Lib/test/test_descrtut.py | 1 + Lib/test/test_float.py | 34 +- Lib/test/test_long.py | 44 ++ Lib/test/test_string.py | 8 + Lib/test/test_unicode.py | 212 +++++++ Makefile.pre.in | 15 + Objects/floatobject.c | 19 + Objects/longobject.c | 13 + Objects/stringlib/formatter.h | 966 ++++++++++++++++++++++++++++++ Objects/stringlib/string_format.h | 831 +++++++++++++++++++++++++ Objects/stringlib/stringdefs.h | 23 + Objects/stringlib/unicodedefs.h | 32 + Objects/typeobject.c | 41 ++ Objects/unicodeobject.c | 246 +++++++- Python/Python-ast.c | 4 +- Python/bltinmodule.c | 56 ++ Python/formatter_unicode.c | 13 + Python/sysmodule.c | 50 ++ 22 files changed, 2669 insertions(+), 18 deletions(-) create mode 100644 Include/formatter_unicode.h create mode 100644 Objects/stringlib/formatter.h create mode 100644 Objects/stringlib/string_format.h create mode 100644 Objects/stringlib/stringdefs.h create mode 100644 Objects/stringlib/unicodedefs.h create mode 100644 Python/formatter_unicode.c diff --git a/Include/formatter_unicode.h b/Include/formatter_unicode.h new file mode 100644 index 00000000000..ccca883dd01 --- /dev/null +++ b/Include/formatter_unicode.h @@ -0,0 +1,9 @@ +PyObject * +unicode_unicode__format__(PyObject *self, PyObject *args); + 
+PyObject * +unicode_long__format__(PyObject *self, PyObject *args); + +PyObject * +unicode_float__format__(PyObject *self, PyObject *args); + diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 4cde46a892f..f940887a852 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1437,6 +1437,11 @@ PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( const Py_UNICODE *s, Py_UNICODE c ); +PyObject * +_unicodeformatter_iterator(PyObject *str); +PyObject * +_unicodeformatter_lookup(PyObject *field_name, PyObject *args, + PyObject *kwargs); #ifdef __cplusplus } diff --git a/Lib/string.py b/Lib/string.py index 87073aab4db..9df78afd1f6 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -189,3 +189,42 @@ def convert(mo): raise ValueError('Unrecognized named group in pattern', self.pattern) return self.pattern.sub(convert, self.template) + + + +######################################################################## +# the Formatter class +# see PEP 3101 for details and purpose of this class + +# The hard parts are reused from the C implementation. They're +# exposed here via the sys module. sys was chosen because it's always +# available and doesn't have to be dynamically loaded. + +# The parser is implemented in sys._formatter_parser. 
+# The "object lookup" is implemented in sys._formatter_lookup + +from sys import _formatter_parser, _formatter_lookup + +class Formatter: + def format(self, format_string, *args, **kwargs): + return self.vformat(format_string, args, kwargs) + + def vformat(self, format_string, args, kwargs): + result = [] + for (is_markup, literal, field_name, format_spec, conversion) in \ + _formatter_parser(format_string): + if is_markup: + # find the object + index, name, obj = _formatter_lookup(field_name, args, kwargs) + else: + result.append(literal) + return ''.join(result) + + def get_value(self, key, args, kwargs): + pass + + def check_unused_args(self, used_args, args, kwargs): + pass + + def format_field(self, value, format_spec): + pass diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py index f77cf78707d..0560045d034 100644 --- a/Lib/test/test_builtin.py +++ b/Lib/test/test_builtin.py @@ -517,6 +517,32 @@ def __float__(self): self.assertAlmostEqual(float(Foo3(21)), 42.) self.assertRaises(TypeError, float, Foo4(42)) + def test_format(self): + class A: + def __init__(self, x): + self.x = x + def __format__(self, format_spec): + return str(self.x) + format_spec + + # class that returns a bad type from __format__ + class H: + def __format__(self, format_spec): + return 1.0 + + self.assertEqual(format(3, ''), '3') + self.assertEqual(format(A(3), 'spec'), '3spec') + + # for builtin types, format(x, "") == str(x) + self.assertEqual(format(17**13, ""), str(17**13)) + self.assertEqual(format(1.0, ""), str(1.0)) + self.assertEqual(format(3.1415e104, ""), str(3.1415e104)) + self.assertEqual(format(-3.1415e104, ""), str(-3.1415e104)) + self.assertEqual(format(3.1415e-104, ""), str(3.1415e-104)) + self.assertEqual(format(-3.1415e-104, ""), str(-3.1415e-104)) + self.assertEqual(format(object, ""), str(object)) + + #self.assertRaises(TypeError, format, H(), "") + def test_getattr(self): import sys self.assert_(getattr(sys, 'stdout') is sys.stdout) diff --git 
a/Lib/test/test_descrtut.py b/Lib/test/test_descrtut.py index fe29f34bd44..d2f9720604a 100644 --- a/Lib/test/test_descrtut.py +++ b/Lib/test/test_descrtut.py @@ -173,6 +173,7 @@ def merge(self, other): '__delslice__', '__doc__', '__eq__', + '__format__', '__ge__', '__getattribute__', '__getitem__', diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py index 48abec95f56..e5a4537885c 100644 --- a/Lib/test/test_float.py +++ b/Lib/test/test_float.py @@ -114,12 +114,44 @@ def neg_neg(): self.assertEquals(pos_pos(), neg_pos()) self.assertEquals(pos_neg(), neg_neg()) +class FormatTestCase(unittest.TestCase): + def testFormat(self): + # these should be rewritten to use both format(x, spec) and + # x.__format__(spec) + + self.assertEqual(format(0.0, 'f'), '0.000000') + + # the default is 'g', except for empty format spec + self.assertEqual(format(0.0, ''), '0.0') + self.assertEqual(format(0.01, ''), '0.01') + self.assertEqual(format(0.01, 'g'), '0.01') + + self.assertEqual(format(0, 'f'), '0.000000') + + self.assertEqual(format(1.0, 'f'), '1.000000') + self.assertEqual(format(1, 'f'), '1.000000') + + self.assertEqual(format(-1.0, 'f'), '-1.000000') + self.assertEqual(format(-1, 'f'), '-1.000000') + + self.assertEqual(format( 1.0, ' f'), ' 1.000000') + self.assertEqual(format(-1.0, ' f'), '-1.000000') + self.assertEqual(format( 1.0, '+f'), '+1.000000') + self.assertEqual(format(-1.0, '+f'), '-1.000000') + + # % formatting + self.assertEqual(format(-1.0, '%'), '-100.000000%') + + # conversion to string should fail + self.assertRaises(ValueError, format, 3.0, "s") + def test_main(): test_support.run_unittest( FormatFunctionsTestCase, UnknownFormatTestCase, - IEEEFormatTestCase) + IEEEFormatTestCase, + FormatTestCase) if __name__ == '__main__': test_main() diff --git a/Lib/test/test_long.py b/Lib/test/test_long.py index 0b67c3e5041..4e15340f6c0 100644 --- a/Lib/test/test_long.py +++ b/Lib/test/test_long.py @@ -493,6 +493,50 @@ def __cmp__(self, other): eq(x > y, Rcmp > 
0, Frm("%r > %r %d", x, y, Rcmp)) eq(x >= y, Rcmp >= 0, Frm("%r >= %r %d", x, y, Rcmp)) + def test_format(self): + self.assertEqual(format(123456789, 'd'), '123456789') + self.assertEqual(format(123456789, 'd'), '123456789') + + # hex + self.assertEqual(format(3, "x"), "3") + self.assertEqual(format(3, "X"), "3") + self.assertEqual(format(1234, "x"), "4d2") + self.assertEqual(format(-1234, "x"), "-4d2") + self.assertEqual(format(1234, "8x"), " 4d2") +# XXX fix self.assertEqual(format(-1234, "8x"), " -4d2") + self.assertEqual(format(1234, "x"), "4d2") + self.assertEqual(format(-1234, "x"), "-4d2") + self.assertEqual(format(-3, "x"), "-3") + self.assertEqual(format(-3, "X"), "-3") + self.assertEqual(format(int('be', 16), "x"), "be") + self.assertEqual(format(int('be', 16), "X"), "BE") + self.assertEqual(format(-int('be', 16), "x"), "-be") + self.assertEqual(format(-int('be', 16), "X"), "-BE") + + # octal + self.assertEqual(format(3, "b"), "11") + self.assertEqual(format(-3, "b"), "-11") + self.assertEqual(format(1234, "b"), "10011010010") + self.assertEqual(format(-1234, "b"), "-10011010010") + self.assertEqual(format(1234, "-b"), "10011010010") + self.assertEqual(format(-1234, "-b"), "-10011010010") + self.assertEqual(format(1234, " b"), " 10011010010") + self.assertEqual(format(-1234, " b"), "-10011010010") + self.assertEqual(format(1234, "+b"), "+10011010010") + self.assertEqual(format(-1234, "+b"), "-10011010010") + + # conversion to float + self.assertEqual(format(0, 'f'), '0.000000') + + # make sure these are errors + self.assertRaises(ValueError, format, 3, "1.3") # precision disallowed + return + self.assertRaises(ValueError, format, 3, "+c") # sign not allowed + # with 'c' + self.assertRaises(ValueError, format, 3, "R") # bogus format type + # conversion to string should fail + self.assertRaises(ValueError, format, 3, "s") + def test_main(): test_support.run_unittest(LongTest) diff --git a/Lib/test/test_string.py b/Lib/test/test_string.py index 
3b21ebc854c..ce9fe231537 100644 --- a/Lib/test/test_string.py +++ b/Lib/test/test_string.py @@ -15,6 +15,14 @@ def test_attrs(self): string.punctuation string.printable + def test_formatter(self): + fmt = string.Formatter() + self.assertEqual(fmt.format("foo"), "foo") + + # Formatter not working you for lookups + #self.assertEqual(fmt.format("foo{0}", "bar"), "foobar") + + def test_maketrans(self): transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377' diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 4b582ded0a2..ff0e0152f0f 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -357,6 +357,218 @@ def test_contains(self): self.assertRaises(TypeError, "abc".__contains__) + def test_format(self): + self.assertEqual(''.format(), '') + self.assertEqual('a'.format(), 'a') + self.assertEqual('ab'.format(), 'ab') + self.assertEqual('a{{'.format(), 'a{') + self.assertEqual('a}}'.format(), 'a}') + self.assertEqual('{{b'.format(), '{b') + self.assertEqual('}}b'.format(), '}b') + self.assertEqual('a{{b'.format(), 'a{b') + + # examples from the PEP: + import datetime + self.assertEqual("My name is {0}".format('Fred'), "My name is Fred") + self.assertEqual("My name is {0[name]}".format(dict(name='Fred')), + "My name is Fred") + self.assertEqual("My 
name is {0} :-{{}}".format('Fred'), + "My name is Fred :-{}") + + d = datetime.date(2007, 8, 18) + self.assertEqual("The year is {0.year}".format(d), + "The year is 2007") + + #"{0!r:20}".format("Hello") + + # classes we'll use for testing + class C: + def __init__(self, x=100): + self._x = x + def __format__(self, spec): + return spec + + class D: + def __init__(self, x): + self.x = x + def __format__(self, spec): + return str(self.x) + + # class with __str__, but no __format__ + class E: + def __init__(self, x): + self.x = x + def __str__(self): + return 'E(' + self.x + ')' + + # class with __repr__, but no __format__ or __str__ + class F: + def __init__(self, x): + self.x = x + def __repr__(self): + return 'F(' + self.x + ')' + + # class with __format__ that forwards to string, for some format_spec's + class G: + def __init__(self, x): + self.x = x + def __str__(self): + return "string is " + self.x + def __format__(self, format_spec): + if format_spec == 'd': + return 'G(' + self.x + ')' + return object.__format__(self, format_spec) + + # class that returns a bad type from __format__ + class H: + def __format__(self, format_spec): + return 1.0 + + + self.assertEqual(''.format(), '') + self.assertEqual('abc'.format(), 'abc') + self.assertEqual('{0}'.format('abc'), 'abc') + self.assertEqual('{0:}'.format('abc'), 'abc') +# self.assertEqual('{ 0 }'.format('abc'), 'abc') + self.assertEqual('X{0}'.format('abc'), 'Xabc') + self.assertEqual('{0}X'.format('abc'), 'abcX') + self.assertEqual('X{0}Y'.format('abc'), 'XabcY') + self.assertEqual('{1}'.format(1, 'abc'), 'abc') + self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc') + self.assertEqual('{1}X'.format(1, 'abc'), 'abcX') + self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY') + self.assertEqual('{0}'.format(-15), '-15') + self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc') + self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc') + self.assertEqual('{{'.format(), '{') + self.assertEqual('}}'.format(), '}') + 
self.assertEqual('{{}}'.format(), '{}') + self.assertEqual('{{x}}'.format(), '{x}') + self.assertEqual('{{{0}}}'.format(123), '{123}') + self.assertEqual('{{{{0}}}}'.format(), '{{0}}') + self.assertEqual('}}{{'.format(), '}{') + self.assertEqual('}}x{{'.format(), '}x{') + + self.assertEqual('{foo._x}'.format(foo=C(20)), '20') + self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010') + self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc') + self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc') + self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def') + self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def') + self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def') + + # I'm not sure if this should work, or if it's a problem if it does work + #'{0[_{foo}]}'.format({'_FOO': 'abc'}, foo='FOO') + #('{0[{foo}{bar}]}'.format({'FOOBAR': 'abc'}, foo='FOO', bar='BAR') + + # format specifiers for built in types + + # strings + self.assertEqual('{0:.3s}'.format('abc'), 'abc') + self.assertEqual('{0:.3s}'.format('ab'), 'ab') + self.assertEqual('{0:.3s}'.format('abcdef'), 'abc') + self.assertEqual('{0:.0s}'.format('abcdef'), '') + self.assertEqual('{0:3.3s}'.format('abc'), 'abc') + self.assertEqual('{0:2.3s}'.format('abc'), 'abc') + self.assertEqual('{0:2.2s}'.format('abc'), 'ab') + self.assertEqual('{0:3.2s}'.format('abc'), 'ab ') + self.assertEqual('{0:x<0s}'.format('result'), 'result') + self.assertEqual('{0:x<5s}'.format('result'), 'result') + self.assertEqual('{0:x<6s}'.format('result'), 'result') + self.assertEqual('{0:x<7s}'.format('result'), 'resultx') + self.assertEqual('{0:x<8s}'.format('result'), 'resultxx') + self.assertEqual('{0: <7s}'.format('result'), 'result ') + self.assertEqual('{0:<7s}'.format('result'), 'result ') + self.assertEqual('{0:>7s}'.format('result'), ' result') + self.assertEqual('{0:>8s}'.format('result'), ' result') + self.assertEqual('{0:^8s}'.format('result'), ' result ') + self.assertEqual('{0:^9s}'.format('result'), ' 
result ') + self.assertEqual('{0:^10s}'.format('result'), ' result ') + self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999) + self.assertEqual('{0:10000}'.format(''), ' ' * 10000) + self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000) + + # format specifiers for user defined type + self.assertEqual('{0:abc}'.format(C()), 'abc') + + # !r and !s coersions + self.assertEqual('{0!s}'.format('Hello'), 'Hello') + self.assertEqual('{0!s:}'.format('Hello'), 'Hello') + self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ') + self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ') + self.assertEqual('{0!r}'.format('Hello'), "'Hello'") + self.assertEqual('{0!r:}'.format('Hello'), "'Hello'") + self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)') + + # XXX should pass, but currently don't + # format(object, "") + + # test fallback to object.__format__ + self.assertEqual('{0}'.format({}), '{}') + self.assertEqual('{0}'.format([]), '[]') + self.assertEqual('{0}'.format([1]), '[1]') + self.assertEqual('{0}'.format(E('data')), 'E(data)') + self.assertEqual('{0:^10}'.format(E('data')), ' E(data) ') + self.assertEqual('{0:^10s}'.format(E('data')), ' E(data) ') + self.assertEqual('{0:d}'.format(G('data')), 'G(data)') + self.assertEqual('{0:>15s}'.format(G('data')), ' string is data') + self.assertEqual('{0!s}'.format(G('data')), 'string is data') + + # string format specifiers + self.assertEqual('{0:}'.format('a'), 'a') + + # computed format specifiers + self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello') + self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello') + self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello') + self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ') + self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ') + + # test various errors + self.assertRaises(ValueError, '{'.format) + 
self.assertRaises(ValueError, '}'.format) + self.assertRaises(ValueError, 'a{'.format) + self.assertRaises(ValueError, 'a}'.format) + self.assertRaises(ValueError, '{a'.format) + self.assertRaises(ValueError, '}a'.format) + self.assertRaises(ValueError, '{0}'.format) + self.assertRaises(ValueError, '{1}'.format, 'abc') + self.assertRaises(ValueError, '{x}'.format) + self.assertRaises(ValueError, "}{".format) + self.assertRaises(ValueError, "{".format) + self.assertRaises(ValueError, "}".format) + self.assertRaises(ValueError, "abc{0:{}".format) + self.assertRaises(ValueError, "{0".format) + self.assertRaises(ValueError, "{0.[]}".format) + self.assertRaises(ValueError, "{0[0}".format) + self.assertRaises(ValueError, "{0[0:foo}".format) + self.assertRaises(ValueError, "{c]}".format) + self.assertRaises(ValueError, "{{ {{{0}}".format) + self.assertRaises(ValueError, "{0}}".format) + self.assertRaises(ValueError, "{foo}".format, bar=3) + self.assertRaises(ValueError, "{0!x}".format, 3) + self.assertRaises(ValueError, "{0!}".format) + self.assertRaises(ValueError, "{0!rs}".format) + self.assertRaises(ValueError, "{!}".format) + self.assertRaises(ValueError, "{:}".format) + self.assertRaises(ValueError, "{}".format) + + # can't have a replacement on the field name portion + self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4) + + # exceed maximum recursion depth + self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '') + self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format, + 0, 1, 2, 3, 4, 5, 6, 7) + + # string format spec errors + self.assertRaises(ValueError, "{0:-s}".format, '') + self.assertRaises(ValueError, format, "", "-") + self.assertRaises(ValueError, "{0:=s}".format, '') + + # check that __format__ returns a string + #self.assertRaises(TypeError, "{0}".format, H()) + def test_formatting(self): string_tests.MixinStrUnicodeUserStringTest.test_formatting(self) # Testing Unicode formatting strings... 
diff --git a/Makefile.pre.in b/Makefile.pre.in index 4e9bab11fd2..53012e892d5 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -271,6 +271,7 @@ PYTHON_OBJS= \ Python/traceback.o \ Python/getopt.o \ Python/pystrtod.o \ + Python/formatter_unicode.o \ Python/$(DYNLOADFILE) \ $(MACHDEP_OBJS) \ $(THREADOBJ) @@ -503,6 +504,19 @@ Python/importdl.o: $(srcdir)/Python/importdl.c Objects/unicodectype.o: $(srcdir)/Objects/unicodectype.c \ $(srcdir)/Objects/unicodetype_db.h +Objects/unicodeobject.o: $(srcdir)/Objects/unicodeobject.c \ + $(srcdir)/Objects/stringlib/string_format.h \ + $(srcdir)/Objects/stringlib/unicodedefs.h \ + $(srcdir)/Objects/stringlib/fastsearch.h \ + $(srcdir)/Objects/stringlib/count.h \ + $(srcdir)/Objects/stringlib/find.h \ + $(srcdir)/Objects/stringlib/partition.h + +Python/formatter_unicode.o: $(srcdir)/Python/formatter_unicode.c \ + $(srcdir)/Objects/stringlib/formatter.h + + + ############################################################################ # Header files @@ -527,6 +541,7 @@ PYTHON_HEADERS= \ Include/genobject.h \ Include/fileobject.h \ Include/floatobject.h \ + Include/formatter_unicode.h \ Include/funcobject.h \ Include/import.h \ Include/intobject.h \ diff --git a/Objects/floatobject.c b/Objects/floatobject.c index 09efa12c65d..ca94750f027 100644 --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -6,6 +6,8 @@ #include "Python.h" +#include "formatter_unicode.h" + #include #if !defined(__STDC__) @@ -1015,6 +1017,21 @@ float_getzero(PyObject *v, void *closure) return PyFloat_FromDouble(0.0); } +static PyObject * +float__format__(PyObject *self, PyObject *args) +{ + /* when back porting this to 2.6, check type of the format_spec + and call either unicode_long__format__ or + string_long__format__ */ + return unicode_float__format__(self, args); +} + +PyDoc_STRVAR(float__format__doc, +"float.__format__(format_spec) -> string\n" +"\n" +"Formats the float according to format_spec."); + + static PyMethodDef float_methods[] = { 
{"conjugate", (PyCFunction)float_float, METH_NOARGS, "Returns self, the complex conjugate of any float."}, @@ -1028,6 +1045,8 @@ static PyMethodDef float_methods[] = { METH_O|METH_CLASS, float_getformat_doc}, {"__setformat__", (PyCFunction)float_setformat, METH_VARARGS|METH_CLASS, float_setformat_doc}, + {"__format__", (PyCFunction)float__format__, + METH_VARARGS, float__format__doc}, {NULL, NULL} /* sentinel */ }; diff --git a/Objects/longobject.c b/Objects/longobject.c index ddf359d0eac..b724edf3e07 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -5,6 +5,8 @@ #include "Python.h" #include "longintrepr.h" +#include "formatter_unicode.h" + #include long @@ -3592,6 +3594,16 @@ long_getN(PyLongObject *v, void *context) { return PyLong_FromLong((intptr_t)context); } +static PyObject * +long__format__(PyObject *self, PyObject *args) +{ + /* when back porting this to 2.6, check type of the format_spec + and call either unicode_long__format__ or + string_long__format__ */ + return unicode_long__format__(self, args); +} + + static PyObject * long_round(PyObject *self, PyObject *args) { @@ -3632,6 +3644,7 @@ static PyMethodDef long_methods[] = { "Rounding an Integral returns itself.\n" "Rounding with an ndigits arguments defers to float.__round__."}, {"__getnewargs__", (PyCFunction)long_getnewargs, METH_NOARGS}, + {"__format__", (PyCFunction)long__format__, METH_VARARGS}, {NULL, NULL} /* sentinel */ }; diff --git a/Objects/stringlib/formatter.h b/Objects/stringlib/formatter.h new file mode 100644 index 00000000000..b65244d7697 --- /dev/null +++ b/Objects/stringlib/formatter.h @@ -0,0 +1,966 @@ +/* implements the string, long, and float formatters. that is, + string.__format__, etc. */ + +/* Before including this, you must include either: + stringlib/unicodedefs.h + stringlib/stringdefs.h + + Also, you should define the names: + FORMAT_STRING + FORMAT_LONG + FORMAT_FLOAT + to be whatever you want the public names of these functions to + be. 
These are the only non-static functions defined here. +*/ + +/* + get_integer consumes 0 or more decimal digit characters from an + input string, updates *result with the corresponding positive + integer, and returns the number of digits consumed. + + returns -1 on error. +*/ +static int +get_integer(STRINGLIB_CHAR **ptr, STRINGLIB_CHAR *end, + Py_ssize_t *result) +{ + Py_ssize_t accumulator, digitval, oldaccumulator; + int numdigits; + accumulator = numdigits = 0; + for (;;(*ptr)++, numdigits++) { + if (*ptr >= end) + break; + digitval = STRINGLIB_TODECIMAL(**ptr); + if (digitval < 0) + break; + /* + This trick was copied from old Unicode format code. It's cute, + but would really suck on an old machine with a slow divide + implementation. Fortunately, in the normal case we do not + expect too many digits. + */ + oldaccumulator = accumulator; + accumulator *= 10; + if ((accumulator+10)/10 != oldaccumulator+1) { + PyErr_Format(PyExc_ValueError, + "Too many decimal digits in format string"); + return -1; + } + accumulator += digitval; + } + *result = accumulator; + return numdigits; +} + +/************************************************************************/ +/*********** standard format specifier parsing **************************/ +/************************************************************************/ + +/* returns true if this character is a specifier alignment token */ +Py_LOCAL_INLINE(int) +is_alignment_token(STRINGLIB_CHAR c) +{ + switch (c) { + case '<': case '>': case '=': case '^': + return 1; + default: + return 0; + } +} + +/* returns true if this character is a sign element */ +Py_LOCAL_INLINE(int) +is_sign_element(STRINGLIB_CHAR c) +{ + switch (c) { + case ' ': case '+': case '-': case '(': + return 1; + default: + return 0; + } +} + + +typedef struct { + STRINGLIB_CHAR fill_char; + STRINGLIB_CHAR align; + STRINGLIB_CHAR sign; + Py_ssize_t width; + Py_ssize_t precision; + STRINGLIB_CHAR type; +} InternalFormatSpec; + +/* + ptr points to the 
start of the format_spec, end points just past its end. + fills in format with the parsed information. + returns 1 on success, 0 on failure. + if failure, sets the exception +*/ +static int +parse_internal_render_format_spec(PyObject *format_spec, + InternalFormatSpec *format, + char default_type) +{ + STRINGLIB_CHAR *ptr = STRINGLIB_STR(format_spec); + STRINGLIB_CHAR *end = ptr + STRINGLIB_LEN(format_spec); + + /* end-ptr is used throughout this code to specify the length of + the input string */ + + Py_ssize_t specified_width; + + format->fill_char = '\0'; + format->align = '\0'; + format->sign = '\0'; + format->width = -1; + format->precision = -1; + format->type = default_type; + + /* If the second char is an alignment token, + then parse the fill char */ + if (end-ptr >= 2 && is_alignment_token(ptr[1])) { + format->align = ptr[1]; + format->fill_char = ptr[0]; + ptr += 2; + } else if (end-ptr >= 1 && is_alignment_token(ptr[0])) { + format->align = ptr[0]; + ptr++; + } + + /* Parse the various sign options */ + if (end-ptr >= 1 && is_sign_element(ptr[0])) { + format->sign = ptr[0]; + ptr++; + if (end-ptr >= 1 && ptr[0] == ')') { + ptr++; + } + } + + /* The special case for 0-padding (backwards compat) */ + if (format->fill_char == '\0' && + end-ptr >= 1 && ptr[0] == '0') { + format->fill_char = '0'; + if (format->align == '\0') { + format->align = '='; + } + ptr++; + } + + /* XXX add error checking */ + specified_width = get_integer(&ptr, end, &format->width); + + /* if specified_width is 0, we didn't consume any characters for + the width. 
in that case, reset the width to -1, because + get_integer() will have set it to zero */ + if (specified_width == 0) { + format->width = -1; + } + + /* Parse field precision */ + if (end-ptr && ptr[0] == '.') { + ptr++; + + /* XXX add error checking */ + specified_width = get_integer(&ptr, end, &format->precision); + + /* not having a precision after a dot is an error */ + if (specified_width == 0) { + PyErr_Format(PyExc_ValueError, + "Format specifier missing precision"); + return 0; + } + + } + + /* Finally, parse the type field */ + + if (end-ptr > 1) { + /* invalid conversion spec */ + PyErr_Format(PyExc_ValueError, "Invalid conversion specification"); + return 0; + } + + if (end-ptr == 1) { + format->type = ptr[0]; + ptr++; + } + + return 1; +} + + +/************************************************************************/ +/*********** common routines for numeric formatting *********************/ +/************************************************************************/ + +/* describes the layout for an integer, see the comment in + _calc_integer_widths() for details */ +typedef struct { + Py_ssize_t n_lpadding; + Py_ssize_t n_spadding; + Py_ssize_t n_rpadding; + char lsign; + Py_ssize_t n_lsign; + char rsign; + Py_ssize_t n_rsign; + Py_ssize_t n_total; /* just a convenience, it's derivable from the + other fields */ +} NumberFieldWidths; + +/* not all fields of format are used. for example, precision is + unused. should this take discrete params in order to be more clear + about what it does? or is passing a single format parameter easier + and more efficient enough to justify a little obfuscation? 
*/
+static void
+calc_number_widths(NumberFieldWidths *r, STRINGLIB_CHAR actual_sign,
+                   Py_ssize_t n_digits, const InternalFormatSpec *format)
+{
+    const int is_negative = (actual_sign == '-');
+    Py_ssize_t n_content;   /* sign characters plus digits */
+    Py_ssize_t n_pad;       /* fill characters needed to reach the width */
+
+    r->n_lpadding = 0;
+    r->n_spadding = 0;
+    r->n_rpadding = 0;
+    r->lsign = '\0';
+    r->n_lsign = 0;
+    r->rsign = '\0';
+    r->n_rsign = 0;
+
+    /* the output will look like:
+       |                                                            |
+       | <lpadding> <lsign> <spadding> <digits> <rsign> <rpadding>  |
+       |                                                            |
+
+       lsign and rsign are computed from format->sign and the actual
+       sign of the number
+
+       digits is already known
+
+       the total width is either given, or computed from the
+       actual digits
+
+       only one of lpadding, spadding, and rpadding can be non-zero,
+       and it's calculated from the width and other fields
+
+       (the exception is '^' alignment, which splits the padding
+       between lpadding and rpadding)
+    */
+
+    /* work out which sign characters get written */
+    switch (format->sign) {
+    case '+':
+        /* always put a + or - */
+        r->n_lsign = 1;
+        r->lsign = is_negative ? '-' : '+';
+        break;
+    case '(':
+        /* negative numbers are wrapped in parentheses */
+        if (is_negative) {
+            r->n_lsign = 1;
+            r->lsign = '(';
+            r->n_rsign = 1;
+            r->rsign = ')';
+        }
+        break;
+    case ' ':
+        /* a blank stands in for the sign of non-negative numbers */
+        r->n_lsign = 1;
+        r->lsign = is_negative ? '-' : ' ';
+        break;
+    default:
+        /* not specified, or the default (-) */
+        if (is_negative) {
+            r->n_lsign = 1;
+            r->lsign = '-';
+        }
+        break;
+    }
+
+    /* padding is only needed when a width was given and the sign(s)
+       plus digits fall short of it */
+    n_content = r->n_lsign + n_digits + r->n_rsign;
+    if (format->width != -1 && n_content < format->width) {
+        n_pad = format->width - n_content;
+        switch (format->align) {
+        case '<':
+            r->n_rpadding = n_pad;
+            break;
+        case '>':
+            r->n_lpadding = n_pad;
+            break;
+        case '^':
+            /* any odd character of padding goes on the right */
+            r->n_lpadding = n_pad / 2;
+            r->n_rpadding = n_pad - r->n_lpadding;
+            break;
+        default:
+            /* must be '=' */
+            r->n_spadding = n_pad;
+            break;
+        }
+    }
+
+    r->n_total = r->n_lpadding + r->n_lsign + r->n_spadding +
+                 n_digits + r->n_rsign + r->n_rpadding;
+}
+
+/* fill in the non-digit parts of a number's string representation,
+   as determined in calc_number_widths().  returns the pointer to
+   where the digits go.
*/
+static STRINGLIB_CHAR *
+fill_number(STRINGLIB_CHAR *p_buf, const NumberFieldWidths *spec,
+            Py_ssize_t n_digits, STRINGLIB_CHAR fill_char)
+{
+    STRINGLIB_CHAR *cursor = p_buf;
+    STRINGLIB_CHAR *digits_start;
+
+    /* everything that precedes the digits */
+    if (spec->n_lpadding) {
+        STRINGLIB_FILL(cursor, fill_char, spec->n_lpadding);
+        cursor += spec->n_lpadding;
+    }
+    if (spec->n_lsign == 1) {
+        *cursor++ = spec->lsign;
+    }
+    if (spec->n_spadding) {
+        STRINGLIB_FILL(cursor, fill_char, spec->n_spadding);
+        cursor += spec->n_spadding;
+    }
+
+    /* leave a gap of n_digits characters for the caller to fill in */
+    digits_start = cursor;
+    cursor += n_digits;
+
+    /* everything that follows the digits */
+    if (spec->n_rsign == 1) {
+        *cursor++ = spec->rsign;
+    }
+    if (spec->n_rpadding) {
+        STRINGLIB_FILL(cursor, fill_char, spec->n_rpadding);
+    }
+    return digits_start;
+}
+
+/************************************************************************/
+/*********** string formatting ******************************************/
+/************************************************************************/
+
+/* builds a new string holding value truncated to format->precision
+   characters (when given) and padded out to format->width characters
+   (when given) according to format->align and format->fill_char.
+   returns NULL with an exception set on failure */
+static PyObject *
+format_string_internal(PyObject *value, const InternalFormatSpec *format)
+{
+    STRINGLIB_CHAR *src = STRINGLIB_STR(value);
+    Py_ssize_t len = STRINGLIB_LEN(value);
+    Py_ssize_t width;          /* total field width */
+    Py_ssize_t lpad;
+    Py_ssize_t rpad;
+    STRINGLIB_CHAR fill_char;
+    STRINGLIB_CHAR *dst;
+    PyObject *result;
+
+    /* sign is not allowed on strings */
+    if (format->sign != '\0') {
+        PyErr_SetString(PyExc_ValueError,
+                        "Sign not allowed in string format specifier");
+        return NULL;
+    }
+
+    /* '=' alignment not allowed on strings */
+    if (format->align == '=') {
+        PyErr_SetString(PyExc_ValueError,
+                        "'=' alignment not allowed "
+                        "in string format specifier");
+        return NULL;
+    }
+
+    /* if precision is specified, output no more than
+       format->precision characters */
+    if (format->precision >= 0 && len >= format->precision) {
+        len = format->precision;
+    }
+
+    /* the field is as wide as requested, but never narrower than the
+       text itself; an unspecified width (-1) always loses to len */
+    width = (format->width > len) ? format->width : len;
+
+    /* allocate the resulting string */
+    result = STRINGLIB_NEW(NULL, width);
+    if (result == NULL)
+        return NULL;
+
+    /* now write into that space */
+    dst = STRINGLIB_STR(result);
+
+    /* split the spare room into leading and trailing padding, based
+       on the alignment */
+    switch (format->align) {
+    case '>':
+        lpad = width - len;
+        break;
+    case '^':
+        lpad = (width - len) / 2;
+        break;
+    default:
+        lpad = 0;
+        break;
+    }
+    rpad = width - len - lpad;
+
+    /* the text goes after the leading padding */
+    memcpy(dst + lpad, src, len * sizeof(STRINGLIB_CHAR));
+
+    /* use the default fill character, if not specified */
+    fill_char = format->fill_char;
+    if (fill_char == '\0')
+        fill_char = ' ';
+
+    /* pad on left */
+    if (lpad)
+        STRINGLIB_FILL(dst, fill_char, lpad);
+
+    /* pad on right */
+    if (rpad)
+        STRINGLIB_FILL(dst + len + lpad, fill_char, rpad);
+
+    return result;
+}
+
+
+/************************************************************************/
+/*********** long formatting ********************************************/
+/************************************************************************/
+
+static PyObject *
+format_long_internal(PyObject *value, const InternalFormatSpec *format)
+{
+    PyObject *result = NULL;
+    int total_leading_chars_to_skip = 0;  /* also includes sign, if
+                                             present */
+    STRINGLIB_CHAR sign = '\0';
+    STRINGLIB_CHAR *p;
+    Py_ssize_t n_digits;       /* count of digits need from the computed
+                                  string */
+    Py_ssize_t len;
+    Py_ssize_t tmp;
+    NumberFieldWidths spec;
+    long x;
+
+    /* no precision allowed on integers */
+    if (format->precision != -1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Precision not allowed in integer format specifier");
+        goto done;
+    }
+
+
+    /* special case for character formatting */
+    if (format->type == 'c') {
+        /* error to specify a sign */
+        if (format->sign != '\0') {
+            PyErr_SetString(PyExc_ValueError,
+                            "Sign not allowed with integer"
+                            " format specifier 'c'");
+            goto done;
+ } + + /* taken from unicodeobject.c formatchar() */ + /* Integer input truncated to a character */ + x = PyInt_AsLong(value); + if (x == -1 && PyErr_Occurred()) + goto done; +#ifdef Py_UNICODE_WIDE + if (x < 0 || x > 0x10ffff) { + PyErr_SetString(PyExc_OverflowError, + "%c arg not in range(0x110000) " + "(wide Python build)"); + goto done; + } +#else + if (x < 0 || x > 0xffff) { + PyErr_SetString(PyExc_OverflowError, + "%c arg not in range(0x10000) " + "(narrow Python build)"); + goto done; + } +#endif + result = STRINGLIB_NEW(NULL, 1); + if (result == NULL) + goto done; + p = STRINGLIB_STR(result); + p[0] = (Py_UNICODE) x; + n_digits = len = 1; + } else { + int base; + int format_leading_chars_to_skip; /* characters added by + PyNumber_ToBase that we + want to skip over. + instead of using them, + we'll compute our + own. */ + /* compute the base and how many characters will be added by + PyNumber_ToBase */ + switch (format->type) { + case 'b': + base = 2; + format_leading_chars_to_skip = 2; /* 0b */ + break; + case 'o': + base = 8; + format_leading_chars_to_skip = 2; /* 0o */ + break; + case 'x': + case 'X': + base = 16; + format_leading_chars_to_skip = 2; /* 0x */ + break; + default: /* shouldn't be needed, but stops a compiler warning */ + case 'd': + base = 10; + format_leading_chars_to_skip = 0; + break; + } + + /* do the hard part, converting to a string in a given base */ + result = PyNumber_ToBase(value, base); + if (result == NULL) + goto done; + + n_digits = STRINGLIB_LEN(result); + len = n_digits; + p = STRINGLIB_STR(result); + + /* if X, convert to uppercase */ + if (format->type == 'X') + for (tmp = 0; tmp < len; tmp++) + p[tmp] = STRINGLIB_TOUPPER(p[tmp]); + + /* is a sign character present in the output? if so, remember it + and skip it */ + sign = p[0]; + if (sign == '-') { + total_leading_chars_to_skip += 1; + n_digits--; + } + + /* skip over the leading digits (0x, 0b, etc.) 
*/ + assert(n_digits >= format_leading_chars_to_skip + 1); + n_digits -= format_leading_chars_to_skip; + total_leading_chars_to_skip += format_leading_chars_to_skip; + } + + calc_number_widths(&spec, sign, n_digits, format); + + /* if the buffer is getting bigger, realloc it. if it's getting + smaller, don't realloc because we need to move the results + around first. realloc after we've done that */ + + if (spec.n_total > len) { + if (STRINGLIB_RESIZE(&result, spec.n_total) < 0) + goto done; + /* recalc, because string might have moved */ + p = STRINGLIB_STR(result); + } + + /* copy the characters into position first, since we're going to + overwrite some of that space */ + /* we need to move if the number of left padding in the output is + different from the number of characters we need to skip */ + if ((spec.n_lpadding + spec.n_lsign + spec.n_spadding) != + total_leading_chars_to_skip) { + memmove(p + (spec.n_lpadding + spec.n_lsign + spec.n_spadding), + p + total_leading_chars_to_skip, + n_digits * sizeof(STRINGLIB_CHAR)); + } + + /* now fill in the non-digit parts */ + fill_number(p, &spec, n_digits, + format->fill_char == '\0' ? ' ' : format->fill_char); + + /* if we're getting smaller, realloc now */ + if (spec.n_total < len) { + if (STRINGLIB_RESIZE(&result, spec.n_total) < 0) + goto done; + } + +done: + return result; +} + + +/************************************************************************/ +/*********** float formatting *******************************************/ +/************************************************************************/ + +/* taken from unicodeobject.c */ +static Py_ssize_t +strtounicode(Py_UNICODE *buffer, const char *charbuffer) +{ + register Py_ssize_t i; + Py_ssize_t len = strlen(charbuffer); + for (i = len - 1; i >= 0; i--) + buffer[i] = (Py_UNICODE) charbuffer[i]; + + return len; +} + +/* the callback function to call to do the actual float formatting. 
+   it matches the definition of PyOS_ascii_formatd */
+typedef char*
+(*DoubleSnprintfFunction)(char *buffer, size_t buf_len,
+                          const char *format, double d);
+
+/* just a wrapper to make PyOS_snprintf look like DoubleSnprintfFunction */
+static char*
+snprintf_double(char *buffer, size_t buf_len, const char *format, double d)
+{
+    PyOS_snprintf(buffer, buf_len, format, d);
+    return NULL;
+}
+
+/* see FORMATBUFLEN in unicodeobject.c */
+#define FLOAT_FORMATBUFLEN 120
+
+/* much of this is taken from unicodeobject.c */
+/* use type instead of format->type, so that it can be overridden by
+   format_number() */
+/* Format the float object "value" with presentation type "type" and
+   return a new string object, or NULL with an exception set.  The
+   digits are produced as 8-bit characters by the "snprintf" callback
+   and then laid out with sign and padding from "format". */
+static PyObject *
+_format_float(STRINGLIB_CHAR type, PyObject *value,
+              const InternalFormatSpec *format,
+              DoubleSnprintfFunction snprintf)
+{
+    /* fmt = '%.' + `prec` + `type`
+       precision is a Py_ssize_t, so allow for the digits of a 64-bit
+       value: 2 + 19 + 1 + 1 (NUL) = 23 (use 32) */
+    char fmt[32];
+
+    /* taken from unicodeobject.c */
+    /* Worst case length calc to ensure no buffer overrun:
+
+       'g' formats:
+         fmt = %#.<prec>g
+         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
+            for any double rep.)
+         len = 1 + prec + 1 + 2 + 5 = 9 + prec
+
+       'f' formats:
+         buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
+         len = 1 + 50 + 1 + prec = 52 + prec
+
+       If prec=0 the effective precision is 1 (the leading digit is
+       always given), therefore increase the length by one.
+
+       Nothing here bounds prec, so the buffer length handed to the
+       callback below is what keeps the text inside charbuf.
+    */
+    char charbuf[FLOAT_FORMATBUFLEN];
+    Py_ssize_t n_digits;
+    double x;
+    Py_ssize_t precision = format->precision;
+    PyObject *result = NULL;
+    STRINGLIB_CHAR sign;
+    const char *trailing = "";
+    STRINGLIB_CHAR *p;
+    NumberFieldWidths spec;
+
+#if STRINGLIB_IS_UNICODE
+    Py_UNICODE unicodebuf[FLOAT_FORMATBUFLEN];
+#endif
+
+    /* first, do the conversion as 8-bit chars, using the platform's
+       snprintf.  then, if needed, convert to unicode. */
+
+    /* 'F' is the same as 'f', per the PEP */
+    if (type == 'F')
+        type = 'f';
+
+    x = PyFloat_AsDouble(value);
+
+    if (x == -1.0 && PyErr_Occurred())
+        goto done;
+
+    /* '%' is 'f' applied to the value times 100, with a literal '%'
+       appended afterwards */
+    if (type == '%') {
+        type = 'f';
+        x *= 100;
+        trailing = "%";
+    }
+
+    if (precision < 0)
+        precision = 6;
+    /* very large magnitudes (>= 1e50) are switched from 'f' to 'g' */
+    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
+        type = 'g';
+
+    /* cast "type", because if we're in unicode we need to pass a
+       8-bit char.  this is safe, because we've restricted what "type"
+       can be */
+    PyOS_snprintf(fmt, sizeof(fmt), "%%.%zd%c", precision, (char)type);
+
+    /* call the passed in function to do the actual formatting.  the
+       length passed leaves room for "trailing", so that the strcat
+       below stays inside charbuf even when a large precision makes
+       the callback fill the space it was given (this relies on the
+       callback honouring buf_len) */
+    snprintf(charbuf, sizeof(charbuf) - strlen(trailing), fmt, x);
+
+    /* adding trailing to fmt with PyOS_snprintf doesn't work, not
+       sure why.  we'll just concatenate it here; room for it was
+       reserved above */
+    strcat(charbuf, trailing);
+
+    /* rather than duplicate the code for snprintf for both unicode
+       and 8 bit strings, we just use the 8 bit version and then
+       convert to unicode in a separate code path.  that's probably
+       the lesser of 2 evils. */
+#if STRINGLIB_IS_UNICODE
+    n_digits = strtounicode(unicodebuf, charbuf);
+    p = unicodebuf;
+#else
+    /* compute the length.  I believe this is done because the return
+       value from snprintf above is unreliable */
+    n_digits = strlen(charbuf);
+    p = charbuf;
+#endif
+
+    /* is a sign character present in the output?  if so, remember it
+       and skip it */
+    sign = p[0];
+    if (sign == '-') {
+        p++;
+        n_digits--;
+    }
+
+    calc_number_widths(&spec, sign, n_digits, format);
+
+    /* allocate a string with enough space */
+    result = STRINGLIB_NEW(NULL, spec.n_total);
+    if (result == NULL)
+        goto done;
+
+    /* fill in the non-digit parts */
+    fill_number(STRINGLIB_STR(result), &spec, n_digits,
+                format->fill_char == '\0' ? ' ' : format->fill_char);
+
+    /* fill in the digit parts */
+    memmove(STRINGLIB_STR(result) +
+                (spec.n_lpadding + spec.n_lsign + spec.n_spadding),
+            p,
+            n_digits * sizeof(STRINGLIB_CHAR));
+
+done:
+    return result;
+}
+
+/* pick the conversion callback for the requested type: 'n' is rendered
+   as 'f' through snprintf_double, every other type goes through
+   PyOS_ascii_formatd */
+static PyObject *
+format_float_internal(PyObject *value, const InternalFormatSpec *format)
+{
+    if (format->type == 'n')
+        return _format_float('f', value, format, snprintf_double);
+    else
+        return _format_float(format->type, value, format, PyOS_ascii_formatd);
+}
+
+/************************************************************************/
+/*********** built in formatters ****************************************/
+/************************************************************************/
+
+/* __format__ for string objects.  args must hold a single format spec
+   string; an empty spec is equivalent to str(value).  Returns a new
+   string object, or NULL with an exception set. */
+PyObject *
+FORMAT_STRING(PyObject* value, PyObject* args)
+{
+    PyObject *format_spec;
+    PyObject *tmp = NULL;
+    PyObject *result = NULL;
+    InternalFormatSpec format;
+
+    if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
+        goto done;
+    if (!STRINGLIB_CHECK(format_spec)) {
+        PyErr_SetString(PyExc_TypeError, STRINGLIB_TYPE_NAME " object required");
+        goto done;
+    }
+
+    /* check for the special case of zero length format spec, make
+       it equivalent to str(value) */
+    if (STRINGLIB_LEN(format_spec) == 0) {
+        result = STRINGLIB_TOSTR(value);
+        goto done;
+    }
+
+    /* parse the format_spec */
+    if (!parse_internal_render_format_spec(format_spec, &format, 's'))
+        goto done;
+
+    /* type conversion? */
+    switch (format.type) {
+    case 's':
+        /* no type conversion needed, already a string.  do the formatting */
+        result = format_string_internal(value, &format);
+        break;
+#if 0
+    case 'b':
+    case 'c':
+    case 'd':
+    case 'o':
+    case 'x':
+    case 'X':
+        /* convert to integer */
+        /* XXX: make a stringlib function to do this when backporting,
+           since FromUnicode differs from FromString */
+        tmp = PyLong_FromUnicode(STRINGLIB_STR(value), STRINGLIB_LEN(value), 0);
+        if (tmp == NULL)
+            goto done;
+        result = format_long_internal(tmp, &format);
+        break;
+
+    case 'e':
+    case 'E':
+    case 'f':
+    case 'F':
+    case 'g':
+    case 'G':
+    case 'n':
+    case '%':
+        /* convert to float */
+        tmp = PyFloat_FromString(value);
+        if (tmp == NULL)
+            goto done;
+        result = format_float_internal(tmp, &format);
+        break;
+#endif
+    default:
+        /* unknown */
+        PyErr_Format(PyExc_ValueError, "Unknown conversion type %c",
+                     format.type);
+        goto done;
+    }
+
+done:
+    Py_XDECREF(tmp);
+    return result;
+}
+
+/* __format__ for integer objects.  args must hold a single format spec
+   string; an empty spec is equivalent to str(value).  Integer types are
+   formatted directly, float types after conversion with
+   PyNumber_Float.  Returns a new string object, or NULL with an
+   exception set. */
+PyObject *
+FORMAT_LONG(PyObject* value, PyObject* args)
+{
+    PyObject *format_spec;
+    PyObject *result = NULL;
+    PyObject *tmp = NULL;
+    InternalFormatSpec format;
+
+    if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
+        goto done;
+    if (!STRINGLIB_CHECK(format_spec)) {
+        PyErr_SetString(PyExc_TypeError, STRINGLIB_TYPE_NAME " object required");
+        goto done;
+    }
+
+    /* check for the special case of zero length format spec, make
+       it equivalent to str(value) */
+    if (STRINGLIB_LEN(format_spec) == 0) {
+        result = STRINGLIB_TOSTR(value);
+        goto done;
+    }
+
+    /* parse the format_spec */
+    if (!parse_internal_render_format_spec(format_spec, &format, 'd'))
+        goto done;
+
+    /* type conversion? */
+    switch (format.type) {
+#if 0
+    case 's':
+        /* convert to string/unicode */
+        tmp = STRINGLIB_TOSTR(value);
+        if (tmp == NULL)
+            goto done;
+        result = format_string_internal(tmp, &format);
+        break;
+#endif
+    case 'b':
+    case 'c':
+    case 'd':
+    case 'o':
+    case 'x':
+    case 'X':
+        /* no type conversion needed, already an int.  do the formatting */
+        result = format_long_internal(value, &format);
+        break;
+
+    case 'e':
+    case 'E':
+    case 'f':
+    case 'F':
+    case 'g':
+    case 'G':
+    case 'n':
+    case '%':
+        /* convert to float */
+        tmp = PyNumber_Float(value);
+        if (tmp == NULL)
+            goto done;
+        /* format the converted float, not the original integer */
+        result = format_float_internal(tmp, &format);
+        break;
+
+    default:
+        /* unknown */
+        PyErr_Format(PyExc_ValueError, "Unknown conversion type %c",
+                     format.type);
+        goto done;
+    }
+
+done:
+    Py_XDECREF(tmp);
+    return result;
+}
+
+/* __format__ for float objects.  args must hold a single format spec
+   string; an empty spec is equivalent to str(value).  Float types are
+   formatted directly, integer types after conversion with
+   PyNumber_Long.  Returns a new string object, or NULL with an
+   exception set. */
+PyObject *
+FORMAT_FLOAT(PyObject *value, PyObject *args)
+{
+    PyObject *format_spec;
+    PyObject *result = NULL;
+    PyObject *tmp = NULL;
+    InternalFormatSpec format;
+
+    if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
+        goto done;
+    if (!STRINGLIB_CHECK(format_spec)) {
+        PyErr_SetString(PyExc_TypeError, STRINGLIB_TYPE_NAME " object required");
+        goto done;
+    }
+
+    /* check for the special case of zero length format spec, make
+       it equivalent to str(value) */
+    if (STRINGLIB_LEN(format_spec) == 0) {
+        result = STRINGLIB_TOSTR(value);
+        goto done;
+    }
+
+    /* parse the format_spec */
+    if (!parse_internal_render_format_spec(format_spec, &format, 'g'))
+        goto done;
+
+    /* type conversion? */
+    switch (format.type) {
+#if 0
+    case 's':
+        /* convert to string/unicode */
+        tmp = STRINGLIB_TOSTR(value);
+        if (tmp == NULL)
+            goto done;
+        result = format_string_internal(tmp, &format);
+        break;
+#endif
+    case 'b':
+    case 'c':
+    case 'd':
+    case 'o':
+    case 'x':
+    case 'X':
+        /* convert to integer */
+        tmp = PyNumber_Long(value);
+        if (tmp == NULL)
+            goto done;
+        result = format_long_internal(tmp, &format);
+        break;
+
+    case 'e':
+    case 'E':
+    case 'f':
+    case 'F':
+    case 'g':
+    case 'G':
+    case 'n':
+    case '%':
+        /* no conversion, already a float.  do the formatting */
+        result = format_float_internal(value, &format);
+        break;
+
+    default:
+        /* unknown */
+        PyErr_Format(PyExc_ValueError, "Unknown conversion type %c",
+                     format.type);
+        goto done;
+    }
+
+done:
+    Py_XDECREF(tmp);
+    return result;
+}
diff --git a/Objects/stringlib/string_format.h b/Objects/stringlib/string_format.h
new file mode 100644
index 00000000000..58032165d34
--- /dev/null
+++ b/Objects/stringlib/string_format.h
@@ -0,0 +1,831 @@
+/*
+    string_format.h -- implementation of string.format().
+
+    It uses the Objects/stringlib conventions, so that it can be
+    compiled for both unicode and string objects.
+*/
+
+
+/* Defines for more efficiently reallocating the string buffer */
+#define INITIAL_SIZE_INCREMENT 100
+#define SIZE_MULTIPLIER 2
+#define MAX_SIZE_INCREMENT 3200
+
+
+/************************************************************************/
+/*********** Global data structures and forward declarations *********/
+/************************************************************************/
+
+/*
+    A SubString consists of the characters between two string or
+    unicode pointers.
+*/
+typedef struct {
+    STRINGLIB_CHAR *ptr;
+    STRINGLIB_CHAR *end;
+} SubString;
+
+
+/* forward declaration for recursion */
+static PyObject *
+build_string(SubString *input, PyObject *args, PyObject *kwargs,
+             int *recursion_level);
+
+
+
+/************************************************************************/
+/************************** Utility functions ************************/
+/************************************************************************/
+
+/* point a SubString at the len characters starting at p; a NULL p
+   gives a SubString whose two pointers are both NULL */
+Py_LOCAL_INLINE(void)
+SubString_init(SubString *str, STRINGLIB_CHAR *p, Py_ssize_t len)
+{
+    str->ptr = p;
+    str->end = (p == NULL) ? NULL : p + len;
+}
+
+/* create a string object, through STRINGLIB_NEW, from the characters
+   between str->ptr and str->end */
+Py_LOCAL_INLINE(PyObject *)
+SubString_new_object(SubString *str)
+{
+    Py_ssize_t n_chars = str->end - str->ptr;
+
+    return STRINGLIB_NEW(str->ptr, n_chars);
+}
+
+/************************************************************************/
+/*********** Error handling and exception generation **************/
+/************************************************************************/
+
+/*
+    Most of our errors are value errors, because to Python, the
+    format string is a "value".  Returning NULL from here is a
+    convenience, so that callers can write "return SetError(...)"
+    when they are erroring out.
+
+    XXX: need better error handling, per PEP 3101.
+*/
+static void *
+SetError(const char *s)
+{
+    /* PyErr_Format always returns NULL */
+    PyObject *always_null;
+
+    always_null = PyErr_Format(PyExc_ValueError, "%s in format string", s);
+    return always_null;
+}
+
+/*
+    check_input returns True if we still have characters
+    left in the input string.
+
+    XXX: make this function go away when better error handling is
+    implemented.
+*/
+Py_LOCAL_INLINE(int)
+check_input(SubString *input)
+{
+    if (input->ptr >= input->end) {
+        PyErr_SetString(PyExc_ValueError,
+                        "unterminated replacement field");
+        return 0;
+    }
+    return 1;
+}
+
+/************************************************************************/
+/*********** Output string management functions ****************/
+/************************************************************************/
+
+/* a growable output buffer: obj is the string object being built, ptr
+   is the next position to write, end is one past the usable space,
+   and size_increment is the slack added by the next output_extend */
+typedef struct {
+    STRINGLIB_CHAR *ptr;
+    STRINGLIB_CHAR *end;
+    PyObject *obj;
+    Py_ssize_t size_increment;
+} OutputString;
+
+/* initialize an OutputString object, reserving size characters.
+   returns 1 on success; on allocation failure returns 0 and leaves
+   output->obj NULL */
+static int
+output_initialize(OutputString *output, Py_ssize_t size)
+{
+    PyObject *buffer = STRINGLIB_NEW(NULL, size);
+
+    output->obj = buffer;
+    if (buffer == NULL)
+        return 0;
+
+    output->ptr = STRINGLIB_STR(buffer);
+    output->end = output->ptr + STRINGLIB_LEN(buffer);
+    output->size_increment = INITIAL_SIZE_INCREMENT;
+
+    return 1;
+}
+
+/*
+    output_extend reallocates the output string buffer so that it can
+    hold what has been written so far, count more characters, and
+    size_increment characters of slack.
+    It returns a status:  0 for a failed reallocation,
+    1 for success.
+*/
+
+static int
+output_extend(OutputString *output, Py_ssize_t count)
+{
+    Py_ssize_t used = output->ptr - STRINGLIB_STR(output->obj);
+    Py_ssize_t new_size = used + count + output->size_increment;
+    STRINGLIB_CHAR *base;
+
+    if (STRINGLIB_RESIZE(&output->obj, new_size) < 0)
+        return 0;
+
+    /* rebuild both pointers from the string's current storage */
+    base = STRINGLIB_STR(output->obj);
+    output->ptr = base + used;
+    output->end = base + new_size;
+
+    /* grow the slack geometrically until it reaches the cap */
+    if (output->size_increment < MAX_SIZE_INCREMENT)
+        output->size_increment *= SIZE_MULTIPLIER;
+    return 1;
+}
+
+/*
+    output_data dumps characters into our output string
+    buffer.
+
+    In some cases, it has to reallocate the string.
+
+    It returns a status:  0 for a failed reallocation,
+    1 for success.
+*/
+static int
+output_data(OutputString *output, const STRINGLIB_CHAR *s, Py_ssize_t count)
+{
+    /* grow the buffer only when the remaining room is too small */
+    if ((count > output->end - output->ptr) && !output_extend(output, count))
+        return 0;
+    memcpy(output->ptr, s, count * sizeof(STRINGLIB_CHAR));
+    output->ptr += count;
+    return 1;
+}
+
+/************************************************************************/
+/*********** Format string parsing -- integers and identifiers *********/
+/************************************************************************/
+
+/*
+    end_identifier returns true if a character marks
+    the end of an identifier string.
+
+    Although the PEP specifies that identifiers are
+    numbers or valid Python identifiers, we just let
+    getattr/getitem handle that, so the implementation
+    is more flexible than the PEP would indicate.
+*/
+Py_LOCAL_INLINE(int)
+end_identifier(STRINGLIB_CHAR c)
+{
+    switch (c) {
+    case '.': case '[': case ']':
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+/*
+    get_integer consumes 0 or more decimal digit characters from an
+    input string, updates *result with the corresponding positive
+    integer, and returns the number of digits consumed.  *ptr is
+    advanced past the digits that were consumed.
+
+    returns -1, with a ValueError set and *result untouched, if the
+    value does not fit in a Py_ssize_t.
+*/
+static int
+get_integer(STRINGLIB_CHAR **ptr, STRINGLIB_CHAR *end,
+            Py_ssize_t *result)
+{
+    Py_ssize_t accumulator = 0;
+    Py_ssize_t digitval;
+    int numdigits = 0;
+
+    for (; *ptr < end; (*ptr)++, numdigits++) {
+        digitval = STRINGLIB_TODECIMAL(**ptr);
+        if (digitval < 0)
+            break;
+        /*
+           Test for overflow before doing the arithmetic: overflowing
+           a signed integer is undefined behaviour in C, so the result
+           cannot be inspected afterwards.  accumulator * 10 + digitval
+           fits exactly when
+           accumulator <= (PY_SSIZE_T_MAX - digitval) / 10.
+        */
+        if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
+            PyErr_Format(PyExc_ValueError,
+                         "Too many decimal digits in format string");
+            return -1;
+        }
+        accumulator = accumulator * 10 + digitval;
+    }
+    *result = accumulator;
+    return numdigits;
+}
+
+/*
+    get_identifier is a bit of a misnomer.  It returns a value for use
+    with getattr or getindex.  This value will be a string/unicode
+    object.  The input cannot be zero length.  Continues until end of
+    input, or end_identifier() returns true.
+*/
+static PyObject *
+get_identifier(SubString *input)
+{
+    STRINGLIB_CHAR *start;
+
+    for (start = input->ptr;
+         input->ptr < input->end && !end_identifier(*input->ptr);
+         input->ptr++)
+        ;
+
+    return STRINGLIB_NEW(start, input->ptr - start);
+
+    /*
+        We might want to add code here to check for invalid Python
+        identifiers.  All identifiers are eventually passed to getattr
+        or getitem, so there is a check when used.  However, we might
+        want to remove (or not) the ability to have strings like
+        "a/b" or " ab" or "-1" (which is not parsed as a number).
+        For now, this is left as an exercise for the first disgruntled
+        user...
+
+    if (XXX -- need check function) {
+        Py_DECREF(result);
+        PyErr_SetString(PyExc_ValueError,
+                        "Invalid embedded Python identifier");
+        return NULL;
+    }
+    */
+}
+
+/************************************************************************/
+/******** Functions to get field objects and specification strings ******/
+/************************************************************************/
+
+/* get_field_and_spec is the main function in this section.  It parses
+   the format string well enough to return a field object to render along
+   with a field specification string.
+*/
+
+/*
+    look up key in our keyword arguments.  returns a new reference, or
+    NULL (with no exception set here) when kwargs is NULL or the key is
+    not found.
+*/
+static PyObject *
+key_lookup(PyObject *kwargs, PyObject *key)
+{
+    PyObject *result;
+
+    if (kwargs && (result = PyDict_GetItem(kwargs, key)) != NULL) {
+        Py_INCREF(result);
+        return result;
+    }
+    return NULL;
+}
+
+/*
+    get_field_object returns the object inside {}, before the
+    format_spec.  It handles getindex and getattr lookups and consumes
+    the entire input string.
+
+    The first component names the argument: a number indexes args, a
+    name is looked up in kwargs.  Each later component is ".name"
+    (getattr) or "[key]" (getitem).  Returns a new reference, or NULL
+    with an exception set.
+*/
+static PyObject *
+get_field_object(SubString *input, PyObject *args, PyObject *kwargs)
+{
+    PyObject *myobj, *subobj, *newobj;
+    STRINGLIB_CHAR c;
+    Py_ssize_t index;
+    int isindex, isnumeric, isargument;
+
+    index = isnumeric = 0;  /* Just to shut up the compiler warnings */
+
+    myobj = args;
+    Py_INCREF(myobj);
+
+    for (isindex=1, isargument=1;;) {
+        if (!check_input(input))
+            break;
+        if (!isindex) {
+            /* ".name": attribute lookup on the current object */
+            if ((subobj = get_identifier(input)) == NULL)
+                break;
+            newobj = PyObject_GetAttr(myobj, subobj);
+            Py_DECREF(subobj);
+        } else {
+            isnumeric = (STRINGLIB_ISDECIMAL(*input->ptr));
+            if (isnumeric) {
+                /* get_integer fails, with the exception already set,
+                   when the number is too large.  stop here instead of
+                   going on with an unset index; myobj is still owned
+                   and is released after the loop */
+                if (get_integer(&input->ptr, input->end, &index) < 0)
+                    break;
+            }
+
+            if (isnumeric && PySequence_Check(myobj))
+                newobj = PySequence_GetItem(myobj, index);
+            else {
+                /* XXX -- do we need PyLong_FromLongLong?
+                                   Using ssizet, not int... */
+                subobj = isnumeric ?
+                          PyInt_FromLong(index) :
+                          get_identifier(input);
+                if (subobj == NULL)
+                    break;
+                if (isargument) {
+                    newobj = key_lookup(kwargs, subobj);
+                } else {
+                    newobj = PyObject_GetItem(myobj, subobj);
+                }
+                Py_DECREF(subobj);
+            }
+        }
+        Py_DECREF(myobj);
+        myobj = newobj;
+        if (myobj == NULL)
+            break;
+        /* a "[key]" lookup must be closed by ']' */
+        if (!isargument && isindex)
+            if ((!check_input(input)) || (*(input->ptr++) != ']')) {
+                SetError("Expected ]");
+                break;
+            }
+
+        /* if at the end of input, return with myobj */
+        if (input->ptr >= input->end)
+            return myobj;
+
+        c = *input->ptr;
+        input->ptr++;
+        isargument = 0;
+        isindex = (c == '[');
+        if (!isindex && (c != '.')) {
+           SetError("Expected ., [, :, !, or }");
+           break;
+        }
+    }
+    if ((myobj == NULL) && isargument) {
+        /* XXX: include more useful error information, like which
+         * keyword not found or which index missing */
+        PyErr_Clear();
+        return SetError(isnumeric
+            ? "Not enough positional arguments"
+            : "Keyword argument not found");
+    }
+    Py_XDECREF(myobj);
+    return NULL;
+}
+
+/************************************************************************/
+/***************** Field rendering functions **************************/
+/************************************************************************/
+
+/*
+    render_field() is the main function in this section.  It takes the
+    field object and field specification string generated by
+    get_field_and_spec, and renders the field into the output string.
+
+    format() does the actual calling of the object's __format__ method.
+*/
+
+
+/* returns fieldobj.__format__(format_spec) as a new reference, or NULL
+   with an exception set.  A TypeError is raised when the type has no
+   __format__ or when __format__ returns something that fails
+   STRINGLIB_CHECK. */
+static PyObject *
+format(PyObject *fieldobj, SubString *format_spec)
+{
+    static PyObject *format_str = NULL;
+    PyObject *meth;
+    PyObject *spec = NULL;
+    PyObject *result = NULL;
+
+    /* Initialize cached value */
+    if (format_str == NULL) {
+        /* Initialize static variable needed by _PyType_Lookup */
+        format_str = PyUnicode_FromString("__format__");
+        if (format_str == NULL)
+            return NULL;
+    }
+
+    /* Make sure the type is initialized.  float gets initialized late */
+    if (Py_Type(fieldobj)->tp_dict == NULL)
+        if (PyType_Ready(Py_Type(fieldobj)) < 0)
+            return NULL;
+
+    /* we need to create an object out of the pointers we have */
+    spec = SubString_new_object(format_spec);
+    if (spec == NULL)
+        goto done;
+
+    /* Find the (unbound!) __format__ method (a borrowed reference) */
+    meth = _PyType_Lookup(Py_Type(fieldobj), format_str);
+    if (meth == NULL) {
+        PyErr_Format(PyExc_TypeError,
+                     "Type %.100s doesn't define __format__",
+                     Py_Type(fieldobj)->tp_name);
+        goto done;
+    }
+
+    /* And call it, binding it to the value */
+    result = PyObject_CallFunctionObjArgs(meth, fieldobj, spec, NULL);
+    if (result == NULL)
+        goto done;
+
+    if (!STRINGLIB_CHECK(result)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__format__ method did not return "
+                        STRINGLIB_TYPE_NAME);
+        Py_DECREF(result);
+        result = NULL;
+        goto done;
+    }
+
+done:
+    Py_XDECREF(spec);
+    return result;
+}
+
+/*
+    render_field calls fieldobj.__format__(format_spec) method, and
+    appends to the output.  returns 1 on success, 0 on failure.
+*/
+static int
+render_field(PyObject *fieldobj, SubString *format_spec, OutputString *output)
+{
+    int ok = 0;
+    PyObject *result = format(fieldobj, format_spec);
+
+    if (result == NULL)
+        goto done;
+
+    ok = output_data(output,
+                     STRINGLIB_STR(result), STRINGLIB_LEN(result));
+done:
+    Py_XDECREF(result);
+    return ok;
+}
+
+/* split "field_name[!conversion][:format_spec]", the text found
+   between the braces, into its parts.  *conversion is '\0' and
+   format_spec is empty when those parts are absent.  returns 1 on
+   success, 0 with a ValueError set on a malformed conversion. */
+static int
+parse_field(SubString *str, SubString *field_name, SubString *format_spec,
+            STRINGLIB_CHAR *conversion)
+{
+    STRINGLIB_CHAR c = 0;
+
+    /* initialize these, as they may be empty */
+    *conversion = '\0';
+    SubString_init(format_spec, NULL, 0);
+
+    /* search for the field name.  it's terminated by the end of the
+       string, or a ':' or '!' */
+    field_name->ptr = str->ptr;
+    while (str->ptr < str->end) {
+        switch (c = *(str->ptr++)) {
+        case ':':
+        case '!':
+            break;
+        default:
+            continue;
+        }
+        break;
+    }
+
+    if (c == '!' || c == ':') {
+        /* we have a format specifier and/or a conversion */
+        /* don't include the last character */
+        field_name->end = str->ptr-1;
+
+        /* the format specifier is the rest of the string */
+        format_spec->ptr = str->ptr;
+        format_spec->end = str->end;
+
+        /* see if there's a conversion specifier */
+        if (c == '!') {
+            /* there must be another character present */
+            if (format_spec->ptr >= format_spec->end) {
+                PyErr_SetString(PyExc_ValueError,
+                                "end of format while looking for conversion "
+                                "specifier");
+                return 0;
+            }
+            *conversion = *(format_spec->ptr++);
+
+            /* if there is another character, it must be a colon */
+            if (format_spec->ptr < format_spec->end) {
+                c = *(format_spec->ptr++);
+                if (c != ':') {
+                    PyErr_SetString(PyExc_ValueError,
+                                    "expected ':' after format specifier");
+                    return 0;
+                }
+            }
+        }
+
+        return 1;
+
+    } else {
+        /* end of string, there's no format_spec or conversion */
+        field_name->end = str->ptr;
+        return 1;
+    }
+}
+
+/************************************************************************/
+/******* Output string allocation and escape-to-markup processing ******/
+/************************************************************************/
+
+/* MarkupIterator breaks the string into pieces of either literal
+   text, or things inside {} that need to be marked up.
it is
+   designed to make it easy to wrap a Python iterator around it, for
+   use with the Formatter class */
+
+typedef struct {
+    SubString str;      /* the part of the format string not yet consumed */
+    int in_markup;      /* true when the next piece starts inside {} */
+} MarkupIterator;
+
+/* start iterating over the len characters at ptr.  always returns 1 */
+static int
+MarkupIterator_init(MarkupIterator *self, STRINGLIB_CHAR *ptr, Py_ssize_t len)
+{
+    SubString_init(&self->str, ptr, len);
+    self->in_markup = 0;
+    return 1;
+}
+
+/* returns 0 on error, 1 on non-error termination, and 2 if it got a
+   string (or something to be expanded).
+
+   on a return of 2, *is_markup tells the two cases apart: when it is
+   false, literal holds plain text to copy to the output; when it is
+   true, field_name, format_spec and *conversion describe the
+   replacement field, and *format_spec_needs_expanding is set if the
+   field contains a nested '{' */
+static int
+MarkupIterator_next(MarkupIterator *self, int *is_markup, SubString *literal,
+                    SubString *field_name, SubString *format_spec,
+                    STRINGLIB_CHAR *conversion,
+                    int *format_spec_needs_expanding)
+{
+    int at_end;
+    STRINGLIB_CHAR c = 0;
+    STRINGLIB_CHAR *start;
+    int count;
+    Py_ssize_t len;
+
+    *format_spec_needs_expanding = 0;
+
+    /* no more input, end of iterator */
+    if (self->str.ptr >= self->str.end)
+        return 1;
+
+    *is_markup = self->in_markup;
+    start = self->str.ptr;
+
+    if (self->in_markup) {
+
+        /* prepare for next iteration */
+        self->in_markup = 0;
+
+        /* this is markup, find the end of the string by counting nested
+           braces.  note that this prohibits escaped braces, so that
+           format_specs cannot have braces in them. */
+        count = 1;
+
+        /* we know we can't have a zero length string, so don't worry
+           about that case */
+        while (self->str.ptr < self->str.end) {
+            switch (c = *(self->str.ptr++)) {
+            case '{':
+                /* the format spec needs to be recursively expanded.
+                   this is an optimization, and not strictly needed */
+                *format_spec_needs_expanding = 1;
+                count++;
+                break;
+            case '}':
+                count--;
+                if (count <= 0) {
+                    /* we're done.  parse and get out */
+                    literal->ptr = start;
+                    literal->end = self->str.ptr-1;
+
+                    if (parse_field(literal, field_name, format_spec,
+                                    conversion) == 0)
+                        return 0;
+
+                    /* success */
+                    return 2;
+                }
+                break;
+            }
+        }
+        /* end of string while searching for matching '}' */
+        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format");
+        return 0;
+
+    } else {
+        /* literal text, read until the end of string, an escaped { or },
+           or an unescaped { */
+        while (self->str.ptr < self->str.end) {
+            switch (c = *(self->str.ptr++)) {
+            case '{':
+            case '}':
+                self->in_markup = 1;
+                break;
+            default:
+                continue;
+            }
+            break;
+        }
+
+        at_end = self->str.ptr >= self->str.end;
+        len = self->str.ptr - start;
+
+        /* report lone braces.  SetError returns a pointer, so set the
+           exception and return the int error status explicitly rather
+           than casting that pointer to int */
+        if ((c == '}') && (at_end || (c != *self->str.ptr))) {
+            SetError("Single } encountered");
+            return 0;
+        }
+        if (at_end && c == '{') {
+            SetError("Single { encountered");
+            return 0;
+        }
+        if (!at_end) {
+            if (c == *self->str.ptr) {
+                /* escaped } or {, skip it in the input.  the first
+                   brace of the pair stays in the literal */
+                self->str.ptr++;
+                self->in_markup = 0;
+            } else
+                /* an unescaped {: it is not part of the literal */
+                len--;
+        }
+
+        /* this is just plain text, return it */
+        literal->ptr = start;
+        literal->end = start + len;
+        return 2;
+    }
+}
+
+
+/* do the !r or !s conversion on obj.  returns a new reference, or NULL
+   with an exception set; any other conversion character is a
+   ValueError */
+static PyObject *
+do_conversion(PyObject *obj, STRINGLIB_CHAR conversion)
+{
+    /* XXX in pre-3.0, do we need to convert this to unicode, since it
+       might have returned a string? */
+    switch (conversion) {
+    case 'r':
+        return PyObject_Repr(obj);
+    case 's':
+        return PyObject_Unicode(obj);
+    default:
+        PyErr_Format(PyExc_ValueError,
+                     "Unknown conversion specifier %c",
+                     conversion);
+        return NULL;
+    }
+}
+
+/* given:
+
+   {field_name!conversion:format_spec}
+
+   compute the result and write it to output.
+   format_spec_needs_expanding is an optimization.  if it's false,
+   just output the string directly, otherwise recursively expand the
+   format_spec string.
*/
+
+static int
+output_markup(SubString *field_name, SubString *format_spec,
+              int format_spec_needs_expanding, STRINGLIB_CHAR conversion,
+              OutputString *output, PyObject *args, PyObject *kwargs,
+              int *recursion_level)
+{
+    PyObject *tmp = NULL;        /* scratch reference, released at done: */
+    PyObject *fieldobj = NULL;   /* the object that gets rendered */
+    SubString expanded_format_spec;
+    SubString *actual_format_spec;
+    int result = 0;              /* return value: stays 0 unless rendering succeeds */
+
+    /* convert field_name to an object */
+    fieldobj = get_field_object(field_name, args, kwargs);
+    if (fieldobj == NULL)
+        goto done;
+
+    if (conversion != '\0') {
+        tmp = do_conversion(fieldobj, conversion);
+        if (tmp == NULL)
+            goto done;
+
+        /* do the assignment, transferring ownership: fieldobj = tmp */
+        Py_DECREF(fieldobj);
+        fieldobj = tmp;
+        tmp = NULL;   /* so the cleanup code does not release it twice */
+    }
+
+    /* if needed, recursively compute the format_spec */
+    if (format_spec_needs_expanding) {
+        tmp = build_string(format_spec, args, kwargs, recursion_level);
+        if (tmp == NULL)
+            goto done;
+
+        /* note that in the case we're expanding the format string,
+           tmp must be kept around until after the call to
+           render_field. */
+        SubString_init(&expanded_format_spec,
+                       STRINGLIB_STR(tmp), STRINGLIB_LEN(tmp));
+        actual_format_spec = &expanded_format_spec;
+    } else
+        actual_format_spec = format_spec;   /* use the caller's spec unchanged */
+
+    if (render_field(fieldobj, actual_format_spec, output) == 0)
+        goto done;
+
+    result = 1;
+
+done:
+    Py_XDECREF(fieldobj);   /* X variants: either pointer may still be NULL */
+    Py_XDECREF(tmp);
+
+    return result;
+}
+
+/*
+    do_markup is the top-level loop for the format() function.  It
+    searches through the format string for escapes to markup codes, and
+    calls other functions to move non-markup text to the output,
+    and to perform the markup to the output.
+*/
+static int
+do_markup(SubString *input, PyObject *args, PyObject *kwargs,
+          OutputString *output, int *recursion_level)
+{
+    MarkupIterator iter;
+    int is_markup;
+    int format_spec_needs_expanding;
+    int result;   /* last MarkupIterator_next() code: 0 error, 1 done, 2 chunk */
+    SubString str;
+    SubString field_name;
+    SubString format_spec;
+    STRINGLIB_CHAR conversion;
+
+    MarkupIterator_init(&iter, input->ptr, input->end - input->ptr);
+    while ((result = MarkupIterator_next(&iter, &is_markup, &str, &field_name,
+                                         &format_spec, &conversion,
+                                         &format_spec_needs_expanding)) == 2) {
+        if (is_markup) {
+            if (!output_markup(&field_name, &format_spec,
+                               format_spec_needs_expanding, conversion, output,
+                               args, kwargs, recursion_level))
+                return 0;
+        } else {
+            if (!output_data(output, str.ptr, str.end-str.ptr))
+                return 0;
+        }
+    }
+    return result;   /* 1 when the input ran out, 0 when parsing failed */
+}
+
+
+/*
+    build_string allocates the output string and then
+    calls do_markup to do the heavy lifting.
+*/
+static PyObject *
+build_string(SubString *input, PyObject *args, PyObject *kwargs,
+             int *recursion_level)
+{
+    OutputString output;
+    PyObject *result = NULL;   /* stays NULL on every failure path */
+    Py_ssize_t count;
+
+    output.obj = NULL; /* needed so cleanup code always works */
+
+    /* check the recursion level */
+    (*recursion_level)--;      /* put back at done:, whatever happens */
+    if (*recursion_level < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Max string recursion exceeded");
+        goto done;
+    }
+
+    /* initial size is the length of the format string, plus the size
+       increment.  seems like a reasonable default */
+    if (!output_initialize(&output,
+                           input->end - input->ptr +
+                           INITIAL_SIZE_INCREMENT))
+        goto done;
+
+    if (!do_markup(input, args, kwargs, &output, recursion_level)) {
+        goto done;
+    }
+
+    /* presumably output.ptr is the write position, so this is the used length */
+    count = output.ptr - STRINGLIB_STR(output.obj);
+    if (STRINGLIB_RESIZE(&output.obj, count) < 0) {
+        goto done;
+    }
+
+    /* transfer ownership to result */
+    result = output.obj;
+    output.obj = NULL;
+
+done:
+    (*recursion_level)++;
+    Py_XDECREF(output.obj);
+    return result;
+}
+
+/************************************************************************/
+/*********** main routine ***********************************************/
+/************************************************************************/
+
+/* this is the main entry point */
+static PyObject *
+do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
+{
+    SubString input;
+
+    /* PEP 3101 says only 2 levels, so that
+       "{0:{1}}".format('abc', 's')            # works
+       "{0:{1:{2}}}".format('abc', 's', '')    # fails
+    */
+    int recursion_level = 2;   /* build_string() counts this down per nesting level */
+
+    SubString_init(&input, STRINGLIB_STR(self), STRINGLIB_LEN(self));
+    return build_string(&input, args, kwargs, &recursion_level);
+}
diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h
new file mode 100644
index 00000000000..af9bbd6fd32
--- /dev/null
+++ b/Objects/stringlib/stringdefs.h
@@ -0,0 +1,23 @@
+#ifndef STRINGLIB_STRINGDEFS_H
+#define STRINGLIB_STRINGDEFS_H
+
+/* this is sort of a hack.  there's at least one place (formatting
+   floats) where some stringlib code takes a different path if it's
+   compiled as unicode.  note that the (x) macros below evaluate x twice. */
+#define STRINGLIB_IS_UNICODE     0
+
+#define STRINGLIB_CHAR           char
+#define STRINGLIB_TYPE_NAME      "string"
+#define STRINGLIB_EMPTY          string_empty
+#define STRINGLIB_ISDECIMAL(x)   (((x) >= '0') && ((x) <= '9'))
+#define STRINGLIB_TODECIMAL(x)   (STRINGLIB_ISDECIMAL(x) ? ((x) - '0') : -1)
+#define STRINGLIB_FILL           memset
+#define STRINGLIB_STR            PyString_AS_STRING
+#define STRINGLIB_LEN            PyString_GET_SIZE
+#define STRINGLIB_NEW            PyString_FromStringAndSize
+#define STRINGLIB_RESIZE         _PyString_Resize
+#define STRINGLIB_CHECK          PyString_Check
+#define STRINGLIB_CMP            memcmp
+#define STRINGLIB_TOSTR          PyObject_Str
+
+#endif /* !STRINGLIB_STRINGDEFS_H */
diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h
new file mode 100644
index 00000000000..1fac2c3d529
--- /dev/null
+++ b/Objects/stringlib/unicodedefs.h
@@ -0,0 +1,32 @@
+#ifndef STRINGLIB_UNICODEDEFS_H
+#define STRINGLIB_UNICODEDEFS_H
+
+/* this is sort of a hack.  there's at least one place (formatting
+   floats) where some stringlib code takes a different path if it's
+   compiled as unicode. */
+#define STRINGLIB_IS_UNICODE     1
+
+#define STRINGLIB_CHAR           Py_UNICODE
+#define STRINGLIB_TYPE_NAME      "unicode"
+#define STRINGLIB_EMPTY          unicode_empty
+#define STRINGLIB_ISDECIMAL      Py_UNICODE_ISDECIMAL
+#define STRINGLIB_TODECIMAL      Py_UNICODE_TODECIMAL
+#define STRINGLIB_TOUPPER        Py_UNICODE_TOUPPER
+#define STRINGLIB_TOLOWER        Py_UNICODE_TOLOWER
+#define STRINGLIB_FILL           Py_UNICODE_FILL
+#define STRINGLIB_STR            PyUnicode_AS_UNICODE
+#define STRINGLIB_LEN            PyUnicode_GET_SIZE
+#define STRINGLIB_NEW            PyUnicode_FromUnicode
+#define STRINGLIB_RESIZE         PyUnicode_Resize
+#define STRINGLIB_CHECK          PyUnicode_Check
+#define STRINGLIB_TOSTR          PyObject_Unicode
+
+Py_LOCAL_INLINE(int)
+STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
+{
+    if (str[0] != other[0])   /* quick reject; note this reads [0] even if len is 0 */
+        return 1;
+    return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
+}
+
+#endif /* !STRINGLIB_UNICODEDEFS_H */
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index 222207ca2b9..4e5e09db61e 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -2933,11 +2933,52 @@ object_reduce_ex(PyObject *self, PyObject *args)
     return _common_reduce(self, proto);
 }
 
+
+/*
+   from PEP 3101, this code implements:
+
+   class object:
+       def __format__(self, format_spec):
+           return format(str(self), format_spec)
+*/
+static PyObject *
+object_format(PyObject *self, PyObject *args)
+{
+    PyObject *format_spec;
+    PyObject *self_as_str = NULL;   /* unicode form of self */
+    PyObject *result = NULL;        /* return value; stays NULL if any step fails */
+    PyObject *format_meth = NULL;   /* the __format__ attribute of self_as_str */
+
+    if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
+        return NULL;
+    if (!PyUnicode_Check(format_spec)) {   /* only a unicode format_spec is accepted */
+        PyErr_SetString(PyExc_TypeError, "Unicode object required");
+        return NULL;
+    }
+
+    self_as_str = PyObject_Unicode(self);
+    if (self_as_str != NULL) {
+        /* find the format function */
+        format_meth = PyObject_GetAttrString(self_as_str, "__format__");
+        if (format_meth != NULL) {
+            /* and call it */
+            result = PyObject_CallFunctionObjArgs(format_meth, format_spec, NULL);
+        }
+    }
+
+    Py_XDECREF(self_as_str);   /* X variants: either pointer may still be NULL */
+    Py_XDECREF(format_meth);
+
+    return result;
+}
+
 static PyMethodDef object_methods[] = {
     {"__reduce_ex__", object_reduce_ex, METH_VARARGS,
      PyDoc_STR("helper for pickle")},
     {"__reduce__", object_reduce, METH_VARARGS,
      PyDoc_STR("helper for pickle")},
+    {"__format__", object_format, METH_VARARGS,
+     PyDoc_STR("default object formatter")},
     {0}
 };
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e227fc72d27..3052ebd4c91 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -45,6 +45,8 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include "unicodeobject.h"
 #include "ucnhash.h"
 
+#include "formatter_unicode.h"   /* declares unicode_unicode__format__() */
+
 #ifdef MS_WINDOWS
 #include   /* NOTE(review): the header name is missing on this line -- confirm against the original file */
 #endif
@@ -5009,21 +5011,7 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
 
 /* --- Helpers ------------------------------------------------------------ */
 
-#define STRINGLIB_CHAR Py_UNICODE
-
-#define STRINGLIB_LEN PyUnicode_GET_SIZE
-#define STRINGLIB_NEW PyUnicode_FromUnicode
-#define STRINGLIB_STR PyUnicode_AS_UNICODE
-
-Py_LOCAL_INLINE(int)
-STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
-{
-    if (str[0] != other[0])
-        return 1;
-    return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
-}
-
-#define STRINGLIB_EMPTY unicode_empty
+#include "stringlib/unicodedefs.h"   /* the definitions removed above now live in this header */
 
 #include "stringlib/fastsearch.h"
 
@@ -7964,6 +7952,33 @@ unicode_endswith(PyUnicodeObject *self,
     return PyBool_FromLong(result);
 }
 
+#include "stringlib/string_format.h"
+
+PyDoc_STRVAR(format__doc__,
+"S.format(*args, **kwargs) -> unicode\n\
+\n\
+");
+
+static PyObject *
+unicode_format(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    /* this calls into stringlib/string_format.h because it can be
+       included for either string or unicode.  this is needed for
+       python 2.6. */
+    return do_string_format(self, args, kwds);
+}
+
+
+PyDoc_STRVAR(p_format__doc__,
+"S.__format__(format_spec) -> unicode\n\
+\n\
+");
+
+static PyObject *
+unicode__format__(PyObject *self, PyObject *args)
+{
+    return unicode_unicode__format__(self, args);   /* thin wrapper: all the work happens there */
+}
 
 
 static PyObject *
@@ -8019,6 +8034,8 @@ static PyMethodDef unicode_methods[] = {
     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
     {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
+    {"format", (PyCFunction) unicode_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
+    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
 #if 0
     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
 #endif
@@ -9124,6 +9141,205 @@ void _Py_ReleaseInternedUnicodeStrings(void)
 }
 
 
+/********************* Formatter Iterator ************************/
+
+/* this is used to implement string.Formatter.vparse().  it exists so
+   Formatter can share code with the built in unicode.format()
+   method */
+
+typedef struct {
+    PyObject_HEAD
+
+    /* we know this to be a unicode object, but since we just keep
+       it around to keep the object alive, having it as PyObject
+       is okay */
+    PyObject *str;
+
+    MarkupIterator it_markup;   /* parser state; it is initialised over str's buffer */
+} formatteriterobject;
+
+static void
+formatteriter_dealloc(formatteriterobject *it)
+{
+    _PyObject_GC_UNTRACK(it);   /* pairs with the _PyObject_GC_TRACK done at creation */
+    Py_XDECREF(it->str);
+    PyObject_GC_Del(it);        /* pairs with PyObject_GC_New */
+}
+
+/* returns a tuple:
+   (is_markup, literal, field_name, format_spec, conversion)
+   if is_markup == True:
+       literal is None
+       field_name is the string before the ':'
+       format_spec is the string after the ':'
+       conversion is either None, or the string after the '!'
+   if is_markup == False:
+       literal is the literal string
+       field_name is None
+       format_spec is None
+       conversion is None
+*/
+static PyObject *
+formatteriter_next(formatteriterobject *it)
+{
+    SubString literal;
+    SubString field_name;
+    SubString format_spec;
+    Py_UNICODE conversion;
+    int is_markup;
+    int format_spec_needs_expanding;
+    PyObject *is_markup_bool = NULL;
+    PyObject *literal_str = NULL;
+    PyObject *field_name_str = NULL;
+    PyObject *format_spec_str = NULL;
+    PyObject *conversion_str = NULL;
+    PyObject *tuple = NULL;   /* return value; NULL until the tuple is built */
+    int result = MarkupIterator_next(&it->it_markup, &is_markup, &literal,
+                                     &field_name, &format_spec, &conversion,
+                                     &format_spec_needs_expanding);
+
+    /* all of the SubString objects point into it->str, so no
+       memory management needs to be done on them */
+
+    /* 0 is an error (already set), 1 is the end of the iterator */
+    if (result != 2)
+        return NULL;
+
+    is_markup_bool = PyBool_FromLong(is_markup);
+    if (!is_markup_bool)
+        goto done;
+
+    if (is_markup) {
+        /* field_name, format_spec, and conversion are returned */
+        literal_str = Py_None;
+        Py_INCREF(literal_str);
+
+        field_name_str = SubString_new_object(&field_name);
+        if (field_name_str == NULL)
+            goto done;
+
+        format_spec_str = SubString_new_object(&format_spec);
+        if (format_spec_str == NULL)
+            goto done;
+
+        /* if the conversion is not specified, return a None, otherwise
+           create a one length string with the conversion character */
+        if (conversion == '\0') {
+            conversion_str = Py_None;
+            Py_INCREF(conversion_str);
+        } else
+            conversion_str = PyUnicode_FromUnicode(&conversion, 1);
+        if (conversion_str == NULL)
+            goto done;
+    } else {
+        /* only literal is returned */
+        literal_str = SubString_new_object(&literal);
+        if (literal_str == NULL)
+            goto done;
+
+        field_name_str = Py_None;
+        format_spec_str = Py_None;
+        conversion_str = Py_None;
+
+        Py_INCREF(field_name_str);
+        Py_INCREF(format_spec_str);
+        Py_INCREF(conversion_str);
+    }
+    /* return a tuple of values.  PyTuple_Pack takes its own references,
+       so ours are released below on success as well as on failure */
+    tuple = PyTuple_Pack(5, is_markup_bool, literal_str, field_name_str,
+                         format_spec_str, conversion_str);
+
+done:
+    Py_XDECREF(is_markup_bool);
+    Py_XDECREF(literal_str);
+    Py_XDECREF(field_name_str);
+    Py_XDECREF(format_spec_str);
+    Py_XDECREF(conversion_str);
+    return tuple;
+}
+
+/* objects of this type are allocated with PyObject_GC_New and tracked,
+   so the type must supply tp_traverse for the collector to call */
+static int
+formatteriter_traverse(formatteriterobject *it, visitproc visit, void *arg)
+{
+    Py_VISIT(it->str);
+    return 0;
+}
+
+static PyMethodDef formatteriter_methods[] = {
+    {NULL,              NULL}           /* sentinel */
+};
+
+PyTypeObject PyFormatterIter_Type = {
+    PyVarObject_HEAD_INIT(&PyType_Type, 0)
+    "formatteriterator",                /* tp_name */
+    sizeof(formatteriterobject),        /* tp_basicsize */
+    0,                                  /* tp_itemsize */
+    /* methods */
+    (destructor)formatteriter_dealloc,  /* tp_dealloc */
+    0,                                  /* tp_print */
+    0,                                  /* tp_getattr */
+    0,                                  /* tp_setattr */
+    0,                                  /* tp_compare */
+    0,                                  /* tp_repr */
+    0,                                  /* tp_as_number */
+    0,                                  /* tp_as_sequence */
+    0,                                  /* tp_as_mapping */
+    0,                                  /* tp_hash */
+    0,                                  /* tp_call */
+    0,                                  /* tp_str */
+    PyObject_GenericGetAttr,            /* tp_getattro */
+    0,                                  /* tp_setattro */
+    0,                                  /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, /* tp_flags */
+    0,                                  /* tp_doc */
+    (traverseproc)formatteriter_traverse, /* tp_traverse */
+    0,                                  /* tp_clear */
+    0,                                  /* tp_richcompare */
+    0,                                  /* tp_weaklistoffset */
+    PyObject_SelfIter,                  /* tp_iter */
+    (iternextfunc)formatteriter_next,   /* tp_iternext */
+    formatteriter_methods,              /* tp_methods */
+    0,                                  /* tp_members */
+};
+
+PyObject *
+_unicodeformatter_iterator(PyObject *str)
+{
+    formatteriterobject *it;
+
+    it = PyObject_GC_New(formatteriterobject, &PyFormatterIter_Type);
+    if (it == NULL)
+        return NULL;
+
+    /* the iterator keeps its own reference to str, released in dealloc */
+    Py_INCREF(str);
+    it->str = str;
+
+    /* initialize the contained MarkupIterator */
+    MarkupIterator_init(&it->it_markup,
+                        PyUnicode_AS_UNICODE(str),
+                        PyUnicode_GET_SIZE(str));
+
+    _PyObject_GC_TRACK(it);   /* only once every field is valid */
+    return (PyObject *)it;
+}
+
+PyObject *
+_unicodeformatter_lookup(PyObject *field_name, PyObject *args,
+                         PyObject *kwargs)
+{
+    /* not written yet.  raise, rather than return NULL with no error set */
+    PyErr_SetString(PyExc_NotImplementedError,
+                    "_formatter_lookup is not implemented");
+    return NULL;
+}
+
+
 /********************* Unicode Iterator **************************/
 
 typedef struct {
diff --git a/Python/Python-ast.c b/Python/Python-ast.c
index 605a152ca97..4c6f42c120b 100644
--- a/Python/Python-ast.c
+++ b/Python/Python-ast.c
@@ -2,7 +2,7 @@
 
 
 /*
-   __version__ 56266.
+   __version__ .
 
    This module must be committed separately after each AST grammar change;
    The __version__ number is set to the revision number of the commit
@@ -3179,7 +3179,7 @@ init_ast(void)
         if (PyDict_SetItemString(d, "AST", (PyObject*)AST_type) < 0) return;
         if (PyModule_AddIntConstant(m, "PyCF_ONLY_AST", PyCF_ONLY_AST) < 0)
                 return;
-        if (PyModule_AddStringConstant(m, "__version__", "56266") < 0)
+        if (PyModule_AddStringConstant(m, "__version__", "") < 0)
                 return;
         if (PyDict_SetItemString(d, "mod", (PyObject*)mod_type) < 0) return;
         if (PyDict_SetItemString(d, "Module", (PyObject*)Module_type) < 0)
diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c
index d087e9c786d..17f5b596fcf 100644
--- a/Python/bltinmodule.c
+++ b/Python/bltinmodule.c
@@ -275,6 +275,61 @@ for which the predicate (a Boolean function) returns true.\n\
 If the predicate is None, 'lambda x: bool(x)' is assumed.\n\
 (This is identical to itertools.ifilter().)");
 
+static PyObject *
+builtin_format(PyObject *self, PyObject *args)
+{
+    static PyObject * format_str = NULL;   /* "__format__", created once and kept */
+    PyObject *value;
+    PyObject *spec;
+    PyObject *meth;
+    PyObject *result;
+
+    /* Initialize cached value */
+    if (format_str == NULL) {
+        /* Initialize static variable needed by _PyType_Lookup */
+        format_str = PyUnicode_FromString("__format__");
+        if (format_str == NULL)
+            return NULL;
+    }
+
+    if (!PyArg_ParseTuple(args, "OO:format", &value, &spec))
+        return NULL;
+
+    /* Make sure the type is initialized.  float gets initialized late */
+    if (Py_Type(value)->tp_dict == NULL)
+        if (PyType_Ready(Py_Type(value)) < 0)
+            return NULL;
+
+    /* Find the (unbound!) __format__ method (a borrowed reference) */
+    meth = _PyType_Lookup(Py_Type(value), format_str);
+    if (meth == NULL) {
+        PyErr_Format(PyExc_TypeError,
+                     "Type %.100s doesn't define __format__",
+                     Py_Type(value)->tp_name);
+        return NULL;
+    }
+
+    /* And call it, binding it to the value */
+    result = PyObject_CallFunctionObjArgs(meth, value, spec, NULL);
+
+    /* result is NULL when the call raised, so test for that before
+       looking at its type; only a real, non-string result is replaced
+       by a TypeError */
+    if (result != NULL && !PyUnicode_Check(result)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__format__ method did not return string");
+        Py_DECREF(result);
+        return NULL;
+    }
+
+    return result;
+}
+
+
+PyDoc_STRVAR(format_doc,
+"format(value, format_spec) -> string\n\
+\n\
+Returns value.__format__(format_spec).");
 
 static PyObject *
 builtin_chr8(PyObject *self, PyObject *args)
@@ -1676,6 +1731,7 @@ static PyMethodDef builtin_methods[] = {
     {"eval", builtin_eval, METH_VARARGS, eval_doc},
     {"exec", builtin_exec, METH_VARARGS, exec_doc},
     {"filter", builtin_filter, METH_VARARGS, filter_doc},
+    {"format", builtin_format, METH_VARARGS, format_doc},
     {"getattr", builtin_getattr, METH_VARARGS, getattr_doc},
     {"globals", (PyCFunction)builtin_globals, METH_NOARGS, globals_doc},
     {"hasattr", builtin_hasattr, METH_VARARGS, hasattr_doc},
diff --git a/Python/formatter_unicode.c b/Python/formatter_unicode.c
new file mode 100644
index 00000000000..114fe3016b8
--- /dev/null
+++ b/Python/formatter_unicode.c
@@ -0,0 +1,13 @@
+/* implements the unicode (as opposed to string) version of the
+   built-in formatters for string, int, float.
that is, the versions
+   of int.__float__, etc., that take and return unicode objects */
+
+#include "Python.h"
+#include "formatter_unicode.h"
+
+#include "../Objects/stringlib/unicodedefs.h"   /* selects the Py_UNICODE flavour of the STRINGLIB_* macros */
+
+#define FORMAT_STRING unicode_unicode__format__   /* the name declared in formatter_unicode.h */
+#define FORMAT_LONG   unicode_long__format__      /* NOTE(review): "int.__float__" above presumably means int.__format__ */
+#define FORMAT_FLOAT  unicode_float__format__
+#include "../Objects/stringlib/formatter.h"   /* presumably defines the three functions named above -- TODO confirm */
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 6ccd3e9a3e9..10a74270c09 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -660,6 +660,54 @@ sys_current_frames(PyObject *self, PyObject *noargs)
     return _PyThread_CurrentFrames();
 }
 
+/* sys_formatter_iterator is used to implement
+   string.Formatter.vformat.  it parses a string and returns tuples
+   describing the parsed elements.  see unicodeobject.c's
+   _unicodeformatter_iterator for details */
+static PyObject *
+sys_formatter_iterator(PyObject *self, PyObject *args)
+{
+    /* in 2.6, check type and dispatch to unicode or string
+       accordingly */
+    PyObject *str;
+
+    /* NOTE(review): sys_methods exposes this function as "_formatter_parser",
+       while the messages here say "_formatter_iterator" -- confirm which name is meant */
+    if (!PyArg_ParseTuple(args, "O:_formatter_iterator", &str))
+        return NULL;
+
+    if (!PyUnicode_Check(str)) {   /* only unicode input is handled for now */
+        PyErr_SetString(PyExc_TypeError,
+                        "_formatter_iterator expects unicode object");
+        return NULL;
+    }
+
+    return _unicodeformatter_iterator(str);
+}
+
+/* sys_formatter_lookup is used to implement string.Formatter.vformat.
+   it takes an PEP 3101 "field name", args, and kwargs, and returns a
+   tuple (index, name, object).
see unicodeobject.c's
+   _unicodeformatter_lookup for details */
+static PyObject *
+sys_formatter_lookup(PyObject *self, PyObject *args)
+{
+    PyObject *field_name;
+    PyObject *arg_args;   /* the caller's positional arguments; "args" is already taken */
+    PyObject *kwargs;
+
+    if (!PyArg_ParseTuple(args, "OOO:_formatter_lookup", &field_name,
+                          &arg_args, &kwargs))
+        return NULL;
+
+    if (!PyUnicode_Check(field_name)) {   /* arg_args and kwargs are passed on unchecked */
+        PyErr_SetString(PyExc_TypeError,
+                        "_formatter_lookup expects unicode object");
+        return NULL;
+    }
+
+    /* NOTE(review): _unicodeformatter_lookup in this patch is a stub that returns NULL */
+    return _unicodeformatter_lookup(field_name, arg_args, kwargs);
+}
+
+
 PyDoc_STRVAR(call_tracing_doc,
 "call_tracing(func, args) -> object\n\
 \n\
@@ -724,6 +772,8 @@ static PyMethodDef sys_methods[] = {
      callstats_doc},
     {"_current_frames", sys_current_frames, METH_NOARGS,
      current_frames_doc},
+    {"_formatter_parser", sys_formatter_iterator, METH_VARARGS},   /* no docstring given */
+    {"_formatter_lookup", sys_formatter_lookup, METH_VARARGS},     /* no docstring given */
     {"displayhook", sys_displayhook, METH_O, displayhook_doc},
     {"exc_info", sys_exc_info, METH_NOARGS, exc_info_doc},
     {"excepthook", sys_excepthook, METH_VARARGS, excepthook_doc},