Handle non-ASCII characters when parsing for ST2.

skuroda · skuroda · commit 951fc5aeabaa · 2014-11-01T19:08:58.000-07:00
diff --git a/README.md b/README.md
@@ -253,6 +253,9 @@ To specify the current working directory, simply type a colon, without any prece
 ## Notes
 Thanks to Dima Kukushkin ([xobb1t](https://github.com/xobb1t)) for the original work on this plugin. Also, thank you to [facelessuser](https://github.com/facelessuser), and by extension biermeester and matthjes for the idea of platform specific settings. Additional thanks to [kemayo](https://github.com/kemayo) for the work in identifying git executable.
 
+### Libraries Used
+* [ushlex](https://bitbucket.org/mixmastamyk/ushlex) - An improved version of `shlex` that supports Unicode characters on Python 2.
+
 ### Contributors
 * [alirezadot](https://github.com/alirezadot)
 * [aventurella](https://github.com/aventurella)
diff --git a/advanced_new_file/commands/command_base.py b/advanced_new_file/commands/command_base.py
@@ -10,8 +10,10 @@
 from ..completions.nix_completion import NixCompletion
 from ..completions.windows_completion import WindowsCompletion
 
-VIEW_NAME = "AdvancedNewFileCreation"
+if not IS_ST3:
+    from ..lib.ushlex import split as st2_shlex_split
 
+VIEW_NAME = "AdvancedNewFileCreation"
 
 class AdvancedNewFileBase(object):
 
@@ -121,14 +123,32 @@ def __validate_folder_index(self, folder_index):
             folder_index = 0
         return folder_index
 
+    def __parse_for_shell_input(self, path):
+        # Non-ASCII input on ST2 goes through the bundled ushlex; all else uses stdlib shlex.
+        if IS_ST3 or not self.__contains_non_ascii(path):
+            tokens = shlex.split(str(path))
+        else:
+            tokens = self.__split_shell_input_for_st2_non_ascii(path)
+        return " ".join(tokens)
+
+    def __split_shell_input_for_st2_non_ascii(self, path):
+        return st2_shlex_split(path)  # ushlex.split; only imported when not IS_ST3
+
+    def __contains_non_ascii(self, string):
+        """Return True if `string` holds any non-ASCII character (Python 2 str/unicode)."""
+        try:
+            string.decode("ascii")
+        except UnicodeError:  # unicode input raises Encode-, byte str raises DecodeError
+            return True
+        return False
+
     def split_path(self, path=""):
         HOME_REGEX = r"^~[/\\]"
         root = None
         try:
             root, path = self.platform.split(path)
             if self.settings.get(SHELL_INPUT_SETTING, False) and len(path) > 0:
-                split_path = shlex.split(path)
-                path = " ".join(split_path)
+                path = self.__parse_for_shell_input(path)
             # Parse if alias
             if TOP_LEVEL_SPLIT_CHAR in path and root is None:
                 parts = path.rsplit(TOP_LEVEL_SPLIT_CHAR, 1)
@@ -403,3 +423,17 @@ def _find_open_file(self, file_name):
                 if view_name != "" and view_name == file_name:
                     return view
         return None
+
+def test_split(s, comments=False, posix=True):
+    """Split `s` shell-style; a byte `str` is lexed as unicode and returned as `str` tokens."""
+    is_str = type(s) is str
+    if is_str:
+        s = unicode(s)
+    # `shlex` is used as a module in this file (`shlex.split`), so the lexer class
+    # has to be referenced as `shlex.shlex`; calling the module itself raises TypeError.
+    lex = shlex.shlex(s, posix=posix)
+    lex.whitespace_split = True
+    if not comments:
+        lex.commenters = ''
+    tokens = list(lex)
+    return [str(x) for x in tokens] if is_str else tokens
diff --git a/advanced_new_file/lib/ushlex.py b/advanced_new_file/lib/ushlex.py
@@ -0,0 +1,299 @@
+# -*- coding: utf-8 -*-
+"""A lexical analyzer class for simple shell-like syntaxes."""
+# Source https://bitbucket.org/mixmastamyk/ushlex
+
+# Module and documentation by Eric S. Raymond, 21 Dec 1998
+# Input stacking and error message cleanup added by ESR, March 2000
+# push_source() and pop_source() made explicit by ESR, January 2001.
+# Posix compliance, split(), string arguments, and
+# iterator interface by Gustavo Niemeyer, April 2003.
+# Modified to support Unicode by Colin Walters, Dec 2007
+
+import os.path
+import sys
+import unicodedata
+from collections import deque
+from StringIO import StringIO
+
+__all__ = ["shlex", "split"]
+
+class shlex:
+    "A lexical analyzer class for simple shell-like syntaxes, with unicode-aware character classes."
+    def __init__(self, instream=None, infile=None, posix=False, utf=True):
+        if isinstance(instream, basestring):  # a plain string is wrapped so it can be read char by char
+            instream = StringIO(instream)
+        if instream is not None:
+            self.instream = instream
+            self.infile = infile
+        else:
+            self.instream = sys.stdin
+            self.infile = None
+        self.posix = posix
+        if posix:
+            self.eof = None  # posix mode: '' is a legal (quoted) token, so EOF is None
+        else:
+            self.eof = ''
+        self.utf = utf  # when true, unicodedata categories also decide whitespace/word chars
+        self.commenters = '#'
+        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
+                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
+        if self.posix and not self.utf:
+            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
+                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
+        elif self.posix:
+            # We dynamically determine character classes below, except
+            # by default _ is a word character
+            self.wordchars = '_'
+        self.whitespace = ' \t\r\n'
+        self.whitespace_split = False
+        self.quotes = '\'"'
+        self.escape = '\\'
+        self.escapedquotes = '"'
+        self.state = ' '  # ' ' between tokens, 'a' inside a word, a quote/escape char, None at EOF
+        self.pushback = deque()
+        self.lineno = 1
+        self.debug = 0
+        self.token = ''
+        self.filestack = deque()
+        self.source = None  # token that triggers inclusion in get_token; disabled while None
+        if self.debug:  # never true here: debug was set to 0 just above
+            print 'shlex: reading from %s, line %d' \
+                  % (self.instream, self.lineno)
+
+    def push_token(self, tok):
+        "Push a token onto the stack popped by the get_token method"
+        if self.debug >= 1:
+            print "shlex: pushing token " + repr(tok)
+        self.pushback.appendleft(tok)
+
+    def push_source(self, newstream, newfile=None):
+        "Push an input source onto the lexer's input source stack."
+        if isinstance(newstream, basestring):
+            newstream = StringIO(newstream)
+        self.filestack.appendleft((self.infile, self.instream, self.lineno))  # saved for pop_source
+        self.infile = newfile
+        self.instream = newstream
+        self.lineno = 1
+        if self.debug:
+            if newfile is not None:
+                print 'shlex: pushing to file %s' % (self.infile,)
+            else:
+                print 'shlex: pushing to stream %s' % (self.instream,)
+
+    def pop_source(self):
+        "Pop the input source stack."
+        self.instream.close()
+        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
+        if self.debug:
+            print 'shlex: popping to %s, line %d' \
+                  % (self.instream, self.lineno)
+        self.state = ' '
+
+    def get_token(self):
+        "Get a token from the input stream (or from stack if it's nonempty)"
+        if self.pushback:
+            tok = self.pushback.popleft()
+            if self.debug >= 1:
+                print "shlex: popping token " + repr(tok)
+            return tok
+        # No pushback.  Get a token.
+        raw = self.read_token()
+        # Handle inclusions
+        if self.source is not None:
+            while raw == self.source:
+                spec = self.sourcehook(self.read_token())
+                if spec:
+                    (newfile, newstream) = spec
+                    self.push_source(newstream, newfile)
+                raw = self.get_token()
+        # Maybe we got EOF instead?
+        while raw == self.eof:
+            if not self.filestack:
+                return self.eof
+            else:
+                self.pop_source()  # an included source ran out; resume the one below it
+                raw = self.get_token()
+        # Neither inclusion nor EOF
+        if self.debug >= 1:
+            if raw != self.eof:
+                print "shlex: token=" + repr(raw)
+            else:
+                print "shlex: token=EOF"
+        return raw
+
+    def __is_whitespace(self, c, category):  # category: unicodedata category of c (None if utf is off)
+        return c in self.whitespace or (self.utf and category[0] == 'Z')
+
+    def __is_wordchar(self, c, category):  # category: unicodedata category of c (None if utf is off)
+        return c in self.wordchars or (self.utf and category[0] in ('L', 'N'))
+
+    def read_token(self):  # read one raw token, one character at a time, driven by self.state
+        quoted = False
+        escapedstate = ' '
+        while True:
+            nextchar = self.instream.read(1)
+            if nextchar and self.utf:
+                nextcategory = unicodedata.category(nextchar)
+            else:
+                nextcategory = None
+            if nextchar == '\n':
+                self.lineno = self.lineno + 1
+            if self.debug >= 3:
+                print "shlex: in state", repr(self.state), \
+                      "I see character:", repr(nextchar)
+            if self.state is None:
+                self.token = ''        # past end of file
+                break
+            elif self.state == ' ':
+                if not nextchar:
+                    self.state = None  # end of file
+                    break
+                if self.__is_whitespace(nextchar, nextcategory):
+                    if self.debug >= 2:
+                        print "shlex: I see whitespace in whitespace state"
+                    if self.token or (self.posix and quoted):
+                        break   # emit current token
+                    else:
+                        continue
+                elif nextchar in self.commenters:
+                    self.instream.readline()  # discard the rest of the comment line
+                    self.lineno = self.lineno + 1
+                elif self.posix and nextchar in self.escape:
+                    escapedstate = 'a'
+                    self.state = nextchar
+                elif self.__is_wordchar(nextchar, nextcategory):
+                    self.token = nextchar
+                    self.state = 'a'
+                elif nextchar in self.quotes:
+                    if not self.posix:
+                        self.token = nextchar
+                    self.state = nextchar
+                elif self.whitespace_split:
+                    self.token = nextchar
+                    self.state = 'a'
+                else:
+                    self.token = nextchar
+                    if self.token or (self.posix and quoted):
+                        break   # emit current token
+                    else:
+                        continue
+            elif self.state in self.quotes:
+                quoted = True
+                if not nextchar:      # end of file
+                    if self.debug >= 2:
+                        print "shlex: I see EOF in quotes state"
+                    # XXX what error should be raised here?
+                    raise ValueError, "No closing quotation"
+                if nextchar == self.state:
+                    if not self.posix:
+                        self.token = self.token + nextchar
+                        self.state = ' '
+                        break
+                    else:
+                        self.state = 'a'
+                elif self.posix and nextchar in self.escape and \
+                     self.state in self.escapedquotes:
+                    escapedstate = self.state
+                    self.state = nextchar
+                else:
+                    self.token = self.token + nextchar
+            elif self.state in self.escape:
+                if not nextchar:      # end of file
+                    if self.debug >= 2:
+                        print "shlex: I see EOF in escape state"
+                    # XXX what error should be raised here?
+                    raise ValueError, "No escaped character"
+                # In posix shells, only the quote itself or the escape
+                # character may be escaped within quotes.
+                if escapedstate in self.quotes and \
+                   nextchar != self.state and nextchar != escapedstate:
+                    self.token = self.token + self.state
+                self.token = self.token + nextchar
+                self.state = escapedstate
+            elif self.state == 'a':
+                if not nextchar:
+                    self.state = None   # end of file
+                    break
+                if self.__is_whitespace(nextchar, nextcategory):
+                    if self.debug >= 2:
+                        print "shlex: I see whitespace in word state"
+                    self.state = ' '
+                    if self.token or (self.posix and quoted):
+                        break   # emit current token
+                    else:
+                        continue
+                elif nextchar in self.commenters:
+                    self.instream.readline()
+                    self.lineno = self.lineno + 1
+                    if self.posix:
+                        self.state = ' '
+                        if self.token or (self.posix and quoted):
+                            break   # emit current token
+                        else:
+                            continue
+                elif self.posix and nextchar in self.quotes:
+                    self.state = nextchar
+                elif self.posix and nextchar in self.escape:
+                    escapedstate = 'a'
+                    self.state = nextchar
+                elif self.__is_wordchar(nextchar, nextcategory) or nextchar in self.quotes \
+                    or self.whitespace_split:
+                    self.token = self.token + nextchar
+                else:
+                    self.pushback.appendleft(nextchar)
+                    if self.debug >= 2:
+                        print "shlex: I see punctuation in word state"
+                    self.state = ' '
+                    if self.token:
+                        break   # emit current token
+                    else:
+                        continue
+        result = self.token
+        self.token = ''
+        if self.posix and not quoted and result == '':
+            result = None  # posix: an unquoted empty result means EOF, not an empty token
+        if self.debug > 1:
+            if result:
+                print "shlex: raw token=" + repr(result)
+            else:
+                print "shlex: raw token=EOF"
+        return result
+
+    def sourcehook(self, newfile, encoding='utf-8'):
+        "Hook called on a filename to be sourced."
+        from codecs import open
+        if newfile[0] == '"':
+            newfile = newfile[1:-1]  # strip the surrounding quotes
+        # This implements cpp-like semantics for relative-path inclusion.
+        if isinstance(self.infile, basestring) and not os.path.isabs(newfile):
+            newfile = os.path.join(os.path.dirname(self.infile), newfile)
+        return (newfile, open(newfile, "r", encoding))
+
+    def error_leader(self, infile=None, lineno=None):
+        "Emit a C-compiler-like, Emacs-friendly error-message leader."
+        if infile is None:
+            infile = self.infile
+        if lineno is None:
+            lineno = self.lineno
+        return "\"%s\", line %d: " % (infile, lineno)
+
+    def __iter__(self):
+        return self
+
+    def next(self):  # Python 2 iterator protocol: yield tokens until get_token returns eof
+        token = self.get_token()
+        if token == self.eof:
+            raise StopIteration
+        return token
+
+def split(s, comments=False, posix=True):
+    """Split `s` into shell-like tokens; a byte `str` input yields `str` tokens."""
+    was_bytes = type(s) is str
+    if was_bytes:
+        s = unicode(s)  # lex as unicode text, convert tokens back at the end
+    lexer = shlex(s, posix=posix)
+    lexer.whitespace_split = True
+    if not comments:
+        lexer.commenters = ''
+    tokens = list(lexer)
+    return [str(tok) for tok in tokens] if was_bytes else tokens
diff --git a/advanced_new_file/reloader.py b/advanced_new_file/reloader.py
@@ -27,6 +27,7 @@
 
     ".lib",
     ".lib.package_resources",
+    ".lib.ushlex",
 
     ".completions",
     '.completions.nix_completion',