commit 367f8568bc9e759ebdfb423648891efa0346456b (HEAD, refs/remotes/origin/master) Author: Simen Heggestøyl Date: Fri Sep 9 18:46:55 2016 +0200 * lisp/emacs-lisp/ring.el: Use lexical-binding * lisp/emacs-lisp/ring.el (ring-elements): Don't use the RESULT argument of `dotimes' when the iteration variable isn't referred by it. (ring-member): Don't pass nil as the RESULT argument of `dotimes' since it's the default. diff --git a/lisp/emacs-lisp/ring.el b/lisp/emacs-lisp/ring.el index b1b6626..c6684ec 100644 --- a/lisp/emacs-lisp/ring.el +++ b/lisp/emacs-lisp/ring.el @@ -1,4 +1,4 @@ -;;; ring.el --- handle rings of items +;;; ring.el --- handle rings of items -*- lexical-binding: t; -*- ;; Copyright (C) 1992, 2001-2016 Free Software Foundation, Inc. @@ -160,14 +160,15 @@ will be performed." (size (ring-size ring)) (vect (cddr ring)) lst) - (dotimes (var (cadr ring) lst) - (push (aref vect (mod (+ start var) size)) lst)))) + (dotimes (var (cadr ring)) + (push (aref vect (mod (+ start var) size)) lst)) + lst)) (defun ring-member (ring item) "Return index of ITEM if on RING, else nil. Comparison is done via `equal'. The index is 0-based." (catch 'found - (dotimes (ind (ring-length ring) nil) + (dotimes (ind (ring-length ring)) (when (equal item (ring-ref ring ind)) (throw 'found ind))))) diff --git a/test/lisp/emacs-lisp/ring-tests.el b/test/lisp/emacs-lisp/ring-tests.el index 705bfe5..affde89 100644 --- a/test/lisp/emacs-lisp/ring-tests.el +++ b/test/lisp/emacs-lisp/ring-tests.el @@ -5,6 +5,8 @@ ;; Author: Simen Heggestøyl ;; Keywords: +;; This file is part of GNU Emacs. 
+ ;; GNU Emacs is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation, either version 3 of the License, or commit 8634efa38179f44c2cb5c52c25ced3f02fa5ec1a Author: Michal Nazarewicz Date: Wed Aug 17 19:53:01 2016 +0200 Split regex character class test into smaller chunks Having one test for all character classes, it is not always trivial to determine which class is failing. This happens when failure is caused by ‘(should (equal (point) (point-max)))’ not being met. With per-character class tests, it is immediately obvious which test causes issues, plus tests for all classes are run even if some of them fail. * test/src/regex-tests.el (regex-character-classes): Delete and split into… (regex-tests-alnum-character-class, regex-tests-alpha-character-class, regex-tests-ascii-character-class, regex-tests-blank-character-class, regex-tests-cntrl-character-class, regex-tests-digit-character-class, regex-tests-graph-character-class, regex-tests-lower-character-class, regex-tests-multibyte-character-class, regex-tests-nonascii-character-class, regex-tests-print-character-class, regex-tests-punct-character-class, regex-tests-space-character-class, regex-tests-unibyte-character-class, regex-tests-upper-character-class, regex-tests-word-character-class, regex-tests-xdigit-character-class): …new tests. diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el index 6e21088..c4844c7 100644 --- a/test/src/regex-tests.el +++ b/test/src/regex-tests.el @@ -45,54 +45,56 @@ character) must match a string \"\u2420\"." 
(concat string suffix))))))))) (defun regex--test-cc (name matching not-matching) - (should (string-match-p (concat "^[[:" name ":]]*$") matching)) - (should (string-match-p (concat "^[[:" name ":]]*?\u2622$") - (concat matching "\u2622"))) - (should (string-match-p (concat "^[^[:" name ":]]*$") not-matching)) - (should (string-match-p (concat "^[^[:" name ":]]*\u2622$") - (concat not-matching "\u2622"))) - (with-temp-buffer - (insert matching) - (let ((p (point))) - (insert not-matching) - (goto-char (point-min)) - (skip-chars-forward (concat "[:" name ":]")) - (should (equal (point) p)) - (skip-chars-forward (concat "^[:" name ":]")) - (should (equal (point) (point-max))) - (goto-char (point-min)) - (skip-chars-forward (concat "[:" name ":]\u2622")) - (should (or (equal (point) p) (equal (point) (1+ p))))))) - -(ert-deftest regex-character-classes () - "Perform sanity test of regexes using character classes. + (let (case-fold-search) + (should (string-match-p (concat "^[[:" name ":]]*$") matching)) + (should (string-match-p (concat "^[[:" name ":]]*?\u2622$") + (concat matching "\u2622"))) + (should (string-match-p (concat "^[^[:" name ":]]*$") not-matching)) + (should (string-match-p (concat "^[^[:" name ":]]*\u2622$") + (concat not-matching "\u2622"))) + (with-temp-buffer + (insert matching) + (let ((p (point))) + (insert not-matching) + (goto-char (point-min)) + (skip-chars-forward (concat "[:" name ":]")) + (should (equal (point) p)) + (skip-chars-forward (concat "^[:" name ":]")) + (should (equal (point) (point-max))) + (goto-char (point-min)) + (skip-chars-forward (concat "[:" name ":]\u2622")) + (should (or (equal (point) p) (equal (point) (1+ p)))))))) + +(dolist (test '(("alnum" "abcABC012łąka" "-, \t\n") + ("alpha" "abcABCłąka" "-,012 \t\n") + ("digit" "012" "abcABCłąka-, \t\n") + ("xdigit" "0123aBc" "łąk-, \t\n") + ("upper" "ABCŁĄKA" "abc012-, \t\n") + ("lower" "abcłąka" "ABC012-, \t\n") + + ("word" "abcABC012\u2620" "-, \t\n") + + ("punct" ".,-" 
"abcABC012\u2620 \t\n") + ("cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ") + ("graph" "abcłąka\u2620-," " \t\n\1") + ("print" "abcłąka\u2620-, " "\t\n\1") + + ("space" " \t\n\u2001" "abcABCł0123") + ("blank" " \t" "\n\u2001") + + ("ascii" "abcABC012 \t\n\1" "łą\u2620") + ("nonascii" "łą\u2622" "abcABC012 \t\n\1") + ("unibyte" "abcABC012 \t\n\1" "łą\u2622") + ("multibyte" "łą\u2622" "abcABC012 \t\n\1"))) + (let ((name (intern (concat "regex-tests-" (car test) "-character-class"))) + (doc (concat "Perform sanity test of regexes using " (car test) + " character class. Go over all the supported character classes and test whether the classes and their inversions match what they are supposed to match. The test is done using `string-match-p' as well as -`skip-chars-forward'." - (let (case-fold-search) - (regex--test-cc "alnum" "abcABC012łąka" "-, \t\n") - (regex--test-cc "alpha" "abcABCłąka" "-,012 \t\n") - (regex--test-cc "digit" "012" "abcABCłąka-, \t\n") - (regex--test-cc "xdigit" "0123aBc" "łąk-, \t\n") - (regex--test-cc "upper" "ABCŁĄKA" "abc012-, \t\n") - (regex--test-cc "lower" "abcłąka" "ABC012-, \t\n") - - (regex--test-cc "word" "abcABC012\u2620" "-, \t\n") - - (regex--test-cc "punct" ".,-" "abcABC012\u2620 \t\n") - (regex--test-cc "cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ") - (regex--test-cc "graph" "abcłąka\u2620-," " \t\n\1") - (regex--test-cc "print" "abcłąka\u2620-, " "\t\n\1") - - (regex--test-cc "space" " \t\n\u2001" "abcABCł0123") - (regex--test-cc "blank" " \t" "\n\u2001") - - (regex--test-cc "ascii" "abcABC012 \t\n\1" "łą\u2620") - (regex--test-cc "nonascii" "łą\u2622" "abcABC012 \t\n\1") - (regex--test-cc "unibyte" "abcABC012 \t\n\1" "łą\u2622") - (regex--test-cc "multibyte" "łą\u2622" "abcABC012 \t\n\1"))) +`skip-chars-forward'."))) + (eval `(ert-deftest ,name () ,doc ,(cons 'regex--test-cc test)) t))) (defmacro regex-tests-generic-line (comment-char test-file whitelist &rest body) commit 4516130d5a4bec47e86bdf560a1375740b6bb110 Author: Michal Nazarewicz 
Date: Mon Sep 5 20:01:23 2016 +0200 Don’t allocate char-table’s extra slots in regexp-opt-charset * lisp/emacs-lisp/regexp-opt.el (regexp-opt-charset): Do not use 'case-table as charmap char-table’s property. The function has nothing to do with casing and in addition using 'case-table causes unnecessary extra slots to be allocated which ‘regexp-opt-charset’ does not use. diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el index b1e132a..cf66530 100644 --- a/lisp/emacs-lisp/regexp-opt.el +++ b/lisp/emacs-lisp/regexp-opt.el @@ -236,7 +236,7 @@ CHARS should be a list of characters." ;; The basic idea is to find character ranges. Also we take care in the ;; position of character set meta characters in the character set regexp. ;; - (let* ((charmap (make-char-table 'case-table)) + (let* ((charmap (make-char-table 'regexp-opt-charset)) (start -1) (end -2) (charset "") (bracket "") (dash "") (caret "")) commit 0e7eb64076c17b3252249aa2a3ef340ce9f395bb Author: Michal Nazarewicz Date: Wed Aug 3 03:52:49 2016 +0200 Remove dead loop iterations in regex.c RE_CHAR_TO_MULTIBYTE(c) yields c for ASCII characters and a byte8 character for c ≥ 0x80. Furthermore, CHAR_BYTE8_P(c) is true only for byte8 characters. This means that c = RE_CHAR_TO_MULTIBYTE (ch); if (! CHAR_BYTE8_P (c) && re_iswctype (c, cc)) is equivalent to: c = ch; if (! false && re_iswctype (c, cc)) for 0 ≤ ch < 0x80, and c = BYTE8_TO_CHAR (ch); if (! true && re_iswctype (c, cc)) for 0x80 ≤ ch < 0x100. In other words, the loop body never executes for ch ≥ 0x80 and the RE_CHAR_TO_MULTIBYTE call is unnecessary for ch < 0x80. * src/regex.c (regex_compile): Simplify a for loop by eliminating dead iterations and unnecessary macro calls. diff --git a/src/regex.c b/src/regex.c index 5f51b43..41c1d3f 100644 --- a/src/regex.c +++ b/src/regex.c @@ -2888,22 +2888,18 @@ regex_compile (const_re_char *pattern, size_t size, done until now. 
*/ SETUP_BUFFER_SYNTAX_TABLE (); - for (ch = 0; ch < 256; ++ch) - { - c = RE_CHAR_TO_MULTIBYTE (ch); - if (! CHAR_BYTE8_P (c) - && re_iswctype (c, cc)) - { - SET_LIST_BIT (ch); - c1 = TRANSLATE (c); - if (c1 == c) - continue; - if (ASCII_CHAR_P (c1)) - SET_LIST_BIT (c1); - else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0) - SET_LIST_BIT (c1); - } - } + for (c = 0; c < 0x80; ++c) + if (re_iswctype (c, cc)) + { + SET_LIST_BIT (c); + c1 = TRANSLATE (c); + if (c1 == c) + continue; + if (ASCII_CHAR_P (c1)) + SET_LIST_BIT (c1); + else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0) + SET_LIST_BIT (c1); + } SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work, re_wctype_to_bit (cc)); #endif /* emacs */ commit c579b28f6281c9cc0b711a012c30bf00036c60bf Author: Michal Nazarewicz Date: Wed Aug 3 03:08:48 2016 +0200 Replace decimalnump with alphanumericp decimalnump was used in regex.c only in the ISALNUM macro, which ORed it with alphabeticp. Because both of those functions require Unicode general category lookup, this resulted in unnecessary lookups (if alphabeticp returned false, decimalnump had to perform another lookup). Drop decimalnump in favour of alphanumericp which combines decimalnump with alphabeticp. * src/character.c (decimalnump): Remove in favour of… (alphanumericp): …new function. * src/regex.c (ISALNUM): Use alphanumericp. diff --git a/src/character.c b/src/character.c index 9f60aa7..b19e41d 100644 --- a/src/character.c +++ b/src/character.c @@ -983,17 +983,26 @@ alphabeticp (int c) || gen_cat == UNICODE_CATEGORY_Nl); } -/* Return true if C is a decimal-number character. */ +/* Return true if C is a alphabetic or decimal-number character. */ bool -decimalnump (int c) +alphanumericp (int c) { Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c); if (! INTEGERP (category)) return false; EMACS_INT gen_cat = XINT (category); - /* See UTS #18. */ - return gen_cat == UNICODE_CATEGORY_Nd; + /* See UTS #18. Same comment as for alphabeticp applies. FIXME. 
*/ + return (gen_cat == UNICODE_CATEGORY_Lu + || gen_cat == UNICODE_CATEGORY_Ll + || gen_cat == UNICODE_CATEGORY_Lt + || gen_cat == UNICODE_CATEGORY_Lm + || gen_cat == UNICODE_CATEGORY_Lo + || gen_cat == UNICODE_CATEGORY_Mn + || gen_cat == UNICODE_CATEGORY_Mc + || gen_cat == UNICODE_CATEGORY_Me + || gen_cat == UNICODE_CATEGORY_Nl + || gen_cat == UNICODE_CATEGORY_Nd); } /* Return true if C is a graphic character. */ diff --git a/src/character.h b/src/character.h index 7f01bc6..2cb76b0 100644 --- a/src/character.h +++ b/src/character.h @@ -676,7 +676,7 @@ extern Lisp_Object Vchar_unify_table; extern Lisp_Object string_escape_byte8 (Lisp_Object); extern bool alphabeticp (int); -extern bool decimalnump (int); +extern bool alphanumericp (int); extern bool graphicp (int); extern bool printablep (int); diff --git a/src/regex.c b/src/regex.c index c808398..5f51b43 100644 --- a/src/regex.c +++ b/src/regex.c @@ -324,7 +324,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; ? (((c) >= 'a' && (c) <= 'z') \ || ((c) >= 'A' && (c) <= 'Z') \ || ((c) >= '0' && (c) <= '9')) \ - : (alphabeticp (c) || decimalnump (c))) + : alphanumericp (c)) # define ISALPHA(c) (IS_REAL_ASCII (c) \ ? (((c) >= 'a' && (c) <= 'z') \ commit b1c4c0050057ca982cab28b8ad11b3175c3bb0d2 Author: Michal Nazarewicz Date: Wed Aug 3 03:48:22 2016 +0200 Remove inaccurate comment in regex.c * src/regex.c (regex_compile): Remove comment indicating that wctype of some character classes may be negative. All wctypes are in fact non-negative. diff --git a/src/regex.c b/src/regex.c index c191f24..c808398 100644 --- a/src/regex.c +++ b/src/regex.c @@ -2879,8 +2879,7 @@ regex_compile (const_re_char *pattern, size_t size, /* Most character classes in a multibyte match just set a flag. Exceptions are is_blank, is_digit, is_cntrl, and is_xdigit, since they can only match ASCII characters. - We don't need to handle them for multibyte. They are - distinguished by a negative wctype. 
*/ + We don't need to handle them for multibyte. */ /* Setup the gl_state object to its buffer-defined value. This hardcodes the buffer-global syntax-table for ASCII commit 3ffc5f44b09e2b500584545389efb8db3be04f95 Author: Michal Nazarewicz Date: Tue Sep 6 17:20:23 2016 +0200 STRING_CHAR does not unify characters; update documentation * src/character.h (STRING_CHAR): Update doc. * src/buffer.h (FETCH_MULTIBYTE_CHAR): Update doc. While at it, change the function to use BYTE_POS_ADDR instead of open-coding it. diff --git a/src/buffer.h b/src/buffer.h index 87b7cee..fa4866e 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -1182,23 +1182,12 @@ buffer_has_overlays (void) /* Return character code of multi-byte form at byte position POS. If POS doesn't point the head of valid multi-byte form, only the byte at - POS is returned. No range checking. - - WARNING: The character returned by this macro could be "unified" - inside STRING_CHAR, if the original character in the buffer belongs - to one of the Private Use Areas (PUAs) of codepoints that Emacs - uses to support non-unified CJK characters. If that happens, - CHAR_BYTES will return a value that is different from the length of - the original multibyte sequence stored in the buffer. Therefore, - do _not_ use FETCH_MULTIBYTE_CHAR if you need to advance through - the buffer to the next character after fetching this one. Instead, - use either FETCH_CHAR_ADVANCE or STRING_CHAR_AND_LENGTH. */ + POS is returned. No range checking. */ INLINE int FETCH_MULTIBYTE_CHAR (ptrdiff_t pos) { - unsigned char *p = ((pos >= GPT_BYTE ? GAP_SIZE : 0) - + pos + BEG_ADDR - BEG_BYTE); + unsigned char *p = BYTE_POS_ADDR (pos); return STRING_CHAR (p); } diff --git a/src/character.h b/src/character.h index 0d0e31c..7f01bc6 100644 --- a/src/character.h +++ b/src/character.h @@ -308,10 +308,7 @@ enum } \ } while (false) -/* Return the character code of character whose multibyte form is at - P. 
Note that this macro unifies CJK characters whose codepoints - are in the Private Use Areas (PUAs), so it might return a different - codepoint from the one actually stored at P. */ +/* Return the character code of character whose multibyte form is at P. */ #define STRING_CHAR(p) \ (!((p)[0] & 0x80) \