Fix bugs in raw strings and character literals

author: Micah Chalmer <micah@micahchalmer.net> 2015-06-08 01:25:36 -0400
committer: Micah Chalmer <micah@micahchalmer.net> 2015-07-05 21:36:33 -0400
commit: 6e72f647da716bf532013484515e5f7ebde60209 (patch)
tree: e77425fd8d0f3a2238af7abea6a3cb97d758d5b4
parent: 53c558cf7241cbeca93f18e04a81a25ef8675313 (diff)
download: rust-mode-6e72f647da716bf532013484515e5f7ebde60209.tar.gz
2 files changed, 196 insertions, 86 deletions
diff --git a/rust-mode-tests.el b/rust-mode-tests.el
index c7446d6..a846a27 100644
--- a/rust-mode-tests.el
+++ b/rust-mode-tests.el
@@ -1482,3 +1482,83 @@ la la\");
   (test-indent
    ;; Needs to leave 1 space before "world"
    "\"hello \\\n world\""))
+
+(defun rust-test-matching-parens (content pairs &optional nonparen-positions)
+  "Assert that in rust-mode, given a buffer with the given `content',
+  emacs's paren matching will find all of the pairs of positions
+  as matching braces.  The list of nonparen-positions asserts
+  specific positions that should NOT be considered to be
+  parens/braces of any kind.
+
+  This does not assert that the `pairs' list is
+  comprehensive--there can be additional pairs that don't appear
+  in the list and the test still passes (as long as none of their
+  positions appear in `nonparen-positions'.)"
+  (with-temp-buffer
+    (rust-mode)
+    (insert content)
+    (font-lock-fontify-buffer)
+    (dolist (pair pairs)
+      (let* ((open-pos (nth 0 pair))
+             (close-pos (nth 1 pair)))
+        (should (equal 4 (syntax-class (syntax-after open-pos))))
+        (should (equal 5 (syntax-class (syntax-after close-pos))))
+        (should (equal (scan-sexps open-pos 1) (+ 1 close-pos)))
+        (should (equal (scan-sexps (+ 1 close-pos) -1) open-pos))))
+    (dolist (nonpar-pos nonparen-positions)
+      (let ((nonpar-syntax-class (syntax-class (syntax-after nonpar-pos))))
+        (should (not (equal 4 nonpar-syntax-class)))
+        (should (not (equal 5 nonpar-syntax-class)))))))
+
+(ert-deftest rust-test-unmatched-single-quote-in-comment-paren-matching ()
+  ;; This was a bug from the char quote handling that affected the paren
+  ;; matching.  An unmatched quote char in a comment caused the problems.
+  (rust-test-matching-parens
+   "// If this appeared first in the file...
+\"\\
+{\";
+
+// And the { was not the on the first column:
+ {
+    // This then messed up the paren matching: '\\'
+}
+
+"
+   '((97 150) ;; The { and } at the bottom
+     )))
+
+(ert-deftest rust-test-two-character-quotes-in-a-row ()
+  (with-temp-buffer
+    (rust-mode)
+    (font-lock-fontify-buffer)
+    (insert "'\\n','a', fn")
+    (font-lock-after-change-function 1 12 0)
+
+    (should (equal 'font-lock-string-face (get-text-property 3 'face)))
+    (should (equal nil (get-text-property 5 'face)))
+    (should (equal 'font-lock-string-face (get-text-property 7 'face)))
+    (should (equal nil (get-text-property 9 'face)))
+    (should (equal 'font-lock-keyword-face (get-text-property 12 'face)))
+    )  
+  )
+
+(ert-deftest single-quote-null-char ()
+  (rust-test-font-lock
+   "'\\0' 'a' fn"
+   '("'\\0'" font-lock-string-face
+     "'a'" font-lock-string-face
+     "fn" font-lock-keyword-face)))
+
+(ert-deftest r-in-string-after-single-quoted-double-quote ()
+  (rust-test-font-lock
+   "'\"';\n\"r\";\n\"oops\";"
+   '("'\"'" font-lock-string-face
+     "\"r\"" font-lock-string-face
+     "\"oops\"" font-lock-string-face
+     )))
+
+(ert-deftest char-literal-after-quote-in-raw-string ()
+  (rust-test-font-lock
+   "r#\"\"\"#;\n'q'"
+   '("r#\"\"\"#" font-lock-string-face
+     "'q'" font-lock-string-face)))
diff --git a/rust-mode.el b/rust-mode.el
index ae42052..193f672 100644
--- a/rust-mode.el
+++ b/rust-mode.el
@@ -374,38 +374,53 @@
              ("fn" . font-lock-function-name-face)
              ("static" . font-lock-constant-face)))))
 
-(defun rust-extend-region-raw-string ()
+(defun rust-font-lock-extend-region ()
   "Extend the region given by `font-lock-beg' and `font-lock-end'
-  to include the beginning of a string if it includes part of it.
-  Adjusts to include the r[#] of a raw string as well."
-  
-  (let* ((orig-beg font-lock-beg)
-         (orig-end font-lock-end)
-         (beg-ppss (syntax-ppss font-lock-beg))
-         (beg-in-str (nth 3 beg-ppss))
-         (end-ppss (syntax-ppss font-lock-end))
-         (end-in-str (nth 3 end-ppss)))
-    
-    (when (and beg-in-str (> font-lock-beg (nth 8 beg-ppss)))
-      (setq font-lock-beg str-beg)
-      (while (equal ?# (char-before font-lock-beg))
-        (setq font-lock-beg (1- font-lock-beg)))
-      (when (equal ?r (char-before font-lock-beg))
-        (setq font-lock-beg (1- font-lock-beg))))
-    
-    (when end-in-str
-      (save-excursion
-        (goto-char (nth 8 end-ppss))
-        (ignore-errors (forward-sexp))
-        (setq font-lock-end (max font-lock-end (point)))))
-    
-    ;; If we have the beginning of a raw string in the region, make sure we have the end of
-    ;; it.
-    (when (or beg-in-str end-in-str)
-      (save-excursion
-        (goto-char font-lock-beg)
-        (while (and (< (point) font-lock-end) (ignore-errors (rust-look-for-raw-string (buffer-end 1)))))
-        (setq font-lock-end (max font-lock-end (point)))))
+  to include the beginning of a string or comment if it includes
+  part of it.  Adjusts to include the r[#] of a raw string as
+  well."
+
+  (let ((orig-beg font-lock-beg)
+        (orig-end font-lock-end))
+    (cond
+     ;; If we are not syntactically fontified yet, we cannot correctly cover
+     ;; anything less than the full buffer. The syntactic fontification
+     ;; modifies the syntax, so until it's done we can't use the syntax to
+     ;; determine what to fontify.
+     ((< (or font-lock-syntactically-fontified 0) font-lock-end)
+      (setq font-lock-beg 1)
+      (setq font-lock-end (buffer-end 1)))
+     
+     ((let* ((beg-ppss (syntax-ppss font-lock-beg))
+             (beg-in-cmnt (and (nth 4 beg-ppss) (nth 8 beg-ppss)))
+             (beg-in-str (nth 3 beg-ppss))
+             (end-ppss (syntax-ppss font-lock-end))
+             (end-in-str (nth 3 end-ppss)))
+        
+        (when (and beg-in-str (> font-lock-beg (nth 8 beg-ppss)))
+          (setq font-lock-beg (nth 8 beg-ppss))
+          (while (equal ?# (char-before font-lock-beg))
+            (setq font-lock-beg (1- font-lock-beg)))
+          (when (equal ?r (char-before font-lock-beg))
+            (setq font-lock-beg (1- font-lock-beg))))
+
+        (when (and beg-in-cmnt (> font-lock-beg beg-in-cmnt))
+          (setq font-lock-beg beg-in-cmnt))
+        
+        (when end-in-str
+          (save-excursion
+            (goto-char (nth 8 end-ppss))
+            (ignore-errors (forward-sexp))
+            (setq font-lock-end (max font-lock-end (point)))))
+        
+        ;; If we have the beginning of a raw string in the region, make sure we have the end of
+        ;; it.
+        (when (or beg-in-str end-in-str)
+          (save-excursion
+            (goto-char font-lock-beg)
+            (while (and (< (point) font-lock-end) (ignore-errors (rust-look-for-raw-string (buffer-end 1)))))
+            (setq font-lock-end (max font-lock-end (point)))))
+        )))
 
     (or (/= font-lock-beg orig-beg)
         (/= font-lock-end orig-end))
@@ -437,67 +452,82 @@
       (set-match-data (nth 1 ret-list))
       (nth 0 ret-list))))
 
-(defun rust-look-for-raw-string (bound)
-  ;; Find a raw string, but only if it's not in the middle of another string or
-  ;; a comment
+(defun rust-look-for-non-standard-string (bound)
+  ;; Find a raw string or character literal, but only if it's not in the middle
+  ;; of another string or a comment.
 
-  (let* ((raw-str-regexp
+  (let* ((non-standard-str-regexp
           (rx
-           (seq
-            ;; The "r" starts the raw string.  Capture it as group 1 to mark it as such syntactically:
-            (group "r")
-
-            ;; Then either:
-            (or
-             ;; a sequence at least one "#" (followed by quote).  Capture all
-             ;; but the last "#" as group 2 for this case.
-             (seq (group (* "#")) "#\"")
-
-             ;; ...or a quote without any "#".  Capture it as group 3. This is
-             ;; used later to match the opposite quote only if this capture
-             ;; occurred
-             (group "\""))
-
-            ;; The contents of the string:
-            (*? anything)
-
-            ;; If there are any backslashes at the end of the string, capture
-            ;; them as group 4 so we can suppress the normal escape syntax
-            ;; parsing:
-            (group (* "\\"))
-
-            ;; Then the end of the string--the backreferences ensure that we
-            ;; only match the kind of ending that corresponds to the beginning
-            ;; we had:
-            (or
-             ;; There were "#"s - capture the last one as group 5 to mark it as
-             ;; the end of the string:
-             (seq "\"" (backref 2) (group "#"))
-
-             ;; No "#"s - capture the ending quote (using a backref to group 3,
-             ;; so that we can't match a quote if we had "#"s) as group 6
-             (group (backref 3))))
-           ;; If it matches, it ends up with the starting character of the string
-           ;; as group 1, any ending backslashes as group 4, and the ending
-           ;; character as either group 5 or group 6.
+           (or
+            ;; Raw string: if it matches, it ends up with the starting character
+            ;; of the string as group 1, any ending backslashes as group 4, and
+            ;; the ending character as either group 5 or group 6.
+            (seq
+             ;; The "r" starts the raw string.  Capture it as group 1 to mark it as such syntactically:
+             (group "r")
+
+             ;; Then either:
+             (or
+              ;; a sequence at least one "#" (followed by quote).  Capture all
+              ;; but the last "#" as group 2 for this case.
+              (seq (group (* "#")) "#\"")
+
+              ;; ...or a quote without any "#".  Capture it as group 3. This is
+              ;; used later to match the opposite quote only if this capture
+              ;; occurred
+              (group "\""))
+
+             ;; The contents of the string:
+             (*? anything)
+
+             ;; If there are any backslashes at the end of the string, capture
+             ;; them as group 4 so we can suppress the normal escape syntax
+             ;; parsing:
+             (group (* "\\"))
+
+             ;; Then the end of the string--the backreferences ensure that we
+             ;; only match the kind of ending that corresponds to the beginning
+             ;; we had:
+             (or
+              ;; There were "#"s - capture the last one as group 5 to mark it as
+              ;; the end of the string:
+              (seq "\"" (backref 2) (group "#"))
+
+              ;; No "#"s - capture the ending quote (using a backref to group 3,
+              ;; so that we can't match a quote if we had "#"s) as group 6
+              (group (backref 3))))
+
+            ;; Character literal: match the beginning ' of a character literal
+            ;; as group 7, and the ending one as group 8
+            (seq
+             (group "'")
+             (or
+              (seq
+               "\\"
+               (or
+                (: "U" (= 8 xdigit))
+                (: "u" (= 4 xdigit))
+                (: "x" (= 2 xdigit))
+                (any "'nrt0\"\\")))
+              (not (any "'\\"))
+              )
+             (group "'"))
+            )
            )))
     (rust-conditional-re-search-forward
-     raw-str-regexp bound
-     (lambda () (save-excursion
-                  (goto-char (match-beginning 0))
-                  (not (rust-in-str-or-cmnt)))))))
+     non-standard-str-regexp bound
+     (lambda ()
+       (let ((pstate (syntax-ppss (match-beginning 0))))
+         (not
+          (or
+           (nth 4 pstate) ;; Skip if in a comment
+           (and (nth 3 pstate) (wholenump (nth 8 pstate)) (< (nth 8 pstate) (match-beginning 0))) ;; Skip if in a string that isn't starting here
+           )))))))
 
 (defvar rust-mode-font-lock-syntactic-keywords
   (append
-   ;; Handle single quoted character literals:
-   (mapcar (lambda (re) (list re '(1 "\"") '(2 "\"")))
-           '("\\('\\)[^']\\('\\)"
-             "\\('\\)\\\\['nrt\"\\]\\('\\)"
-             "\\('\\)\\\\x[[:xdigit:]]\\{2\\}\\('\\)"
-             "\\('\\)\\\\u[[:xdigit:]]\\{4\\}\\('\\)"
-             "\\('\\)\\\\U[[:xdigit:]]\\{8\\}\\('\\)"))
-   ;; Handle raw strings:
-   `((rust-look-for-raw-string (1 "|") (4 "_" nil t) (5 "|" nil t) (6 "|" nil t)))))
+   ;; Handle raw strings and character literals:
+   `((rust-look-for-non-standard-string (1 "|" nil t) (4 "_" nil t) (5 "|" nil t) (6 "|" nil t) (7 "\"" nil t) (8 "\"" nil t)))))
 
 (defun rust-mode-syntactic-face-function (state)
   "Syntactic face function to distinguish doc comments from other comments."
@@ -768,7 +798,7 @@ This is written mainly to be used as `end-of-defun-function' for Rust."
   (setq-local indent-line-function 'rust-mode-indent-line)
 
   ;; Fonts
-  (add-to-list 'font-lock-extend-region-functions 'rust-extend-region-raw-string)
+  (add-to-list 'font-lock-extend-region-functions 'rust-font-lock-extend-region)
   (setq-local font-lock-defaults '(rust-mode-font-lock-keywords
                                    nil nil nil nil
                                    (font-lock-syntactic-keywords . rust-mode-font-lock-syntactic-keywords)
author	Micah Chalmer <micah@micahchalmer.net>	2015-06-08 01:25:36 -0400
committer	Micah Chalmer <micah@micahchalmer.net>	2015-07-05 21:36:33 -0400
commit	6e72f647da716bf532013484515e5f7ebde60209 (patch)
tree	e77425fd8d0f3a2238af7abea6a3cb97d758d5b4
parent	53c558cf7241cbeca93f18e04a81a25ef8675313 (diff)
download	rust-mode-6e72f647da716bf532013484515e5f7ebde60209.tar.gz