SF #941229: Decode source code with sys.stdin.encoding in interactive

modes like non-interactive modes. This allows for non-latin-1 users to write unicode strings directly and sets Japanese users free from weird manual escaping <wink> in shift_jis environments. (Reviewed by Martin v. Loewis)
2024-09-18 20:01:39 +00:00 · 2004-08-04 17:36:41 +00:00 · 2004-08-04 17:36:41 +00:00 · 7df44b384a
parent 5910d81c97
commit 7df44b384a
2 changed files with 65 additions and 0 deletions
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -70,6 +70,10 @@ Core and builtins
 - unicode.iswide() and unicode.width() are dropped and the East Asian
  Width support is moved to the unicodedata extension module.
 - Patch #941229: The source code encoding in interactive mode
  now follows sys.stdin.encoding instead of always being ISO-8859-1.
  This allows non-latin-1 users to write unicode strings directly.
 Extension modules
 -----------------
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@ -651,6 +651,63 @@ PyTokenizer_Free(struct tok_state *tok)
 	PyMem_DEL(tok);
 }
 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 /* Re-encode one line of interactive input as UTF-8.
  *
  * The line in *inp is decoded using the encoding recorded on the
  * sys.stdin file object and encoded again as UTF-8.  On success the old
  * buffer is released, *inp is pointed at the new copy, and tok->encoding
  * is replaced with a copy of the encoding name.
  *
  * Nothing is converted (and 0 is returned) when sys.stdin is not the C
  * level stdin stream, is not a file object, or carries no string
  * encoding, and also when decoding or re-encoding raises: the pending
  * exception is then cleared and the line is left untouched.
  *
  * Returns 0 in all of the cases above.  Returns -1 with tok->done set to
  * E_NOMEM when new_string() returns NULL; *inp still points at a live
  * buffer in that case (either the original or the converted line).
  */
 static int
 tok_stdin_decode(struct tok_state *tok, char **inp)
 {
 	PyObject *stdin_obj, *enc_obj, *unicode_line, *utf8_line;
 	const char *enc_name;
 	char *recoded;
 	int status = 0;
 	/* Only act when Python's sys.stdin is the real C stdin stream. */
 	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
 		return 0;
 	stdin_obj = PySys_GetObject("stdin");
 	if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
 		return 0;
 	enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
 	if (enc_obj == NULL || !PyString_Check(enc_obj))
 		return 0;
 	/* Hold our own reference while enc_name points into the object. */
 	Py_INCREF(enc_obj);
 	enc_name = PyString_AsString(enc_obj);
 	/* stdin encoding -> unicode -> UTF-8 */
 	utf8_line = NULL;
 	unicode_line = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
 	if (unicode_line != NULL) {
 		utf8_line = PyUnicode_AsEncodedString(unicode_line,
 						      "utf-8", NULL);
 		Py_DECREF(unicode_line);
 	}
 	if (utf8_line == NULL) {
 		/* Fallback to iso-8859-1: for backward compatibility */
 		Py_DECREF(enc_obj);
 		PyErr_Clear();
 		return 0;
 	}
 	recoded = new_string(PyString_AsString(utf8_line),
 			     PyString_Size(utf8_line));
 	Py_DECREF(utf8_line);
 	if (recoded != NULL) {
 		/* Swap the converted line in for the raw one and remember
 		   which encoding it originally came from. */
 		PyMem_FREE(*inp);
 		*inp = recoded;
 		if (tok->encoding != NULL)
 			PyMem_DEL(tok->encoding);
 		tok->encoding = new_string(enc_name, strlen(enc_name));
 	}
 	/* Either copy may have failed; both are reported the same way. */
 	if (recoded == NULL || tok->encoding == NULL) {
 		tok->done = E_NOMEM;
 		status = -1;
 	}
 	Py_DECREF(enc_obj);
 	return status;
 }
 #endif
 /* Get next char, updating state; error code goes into tok->done */
@ -690,6 +747,10 @@ tok_nextc(register struct tok_state *tok)
 				PyMem_FREE(new);
 				tok->done = E_EOF;
 			}
 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 			else if (tok_stdin_decode(tok, &new) != 0)
 				PyMem_FREE(new);
 #endif
 			else if (tok->start != NULL) {
 				size_t start = tok->start - tok->buf;
 				size_t oldlen = tok->cur - tok->buf;