Skip to content

Commit 951fc5a

Browse files
committed
Handle non ascii characters when parsing for ST2.
1 parent 5200445 commit 951fc5a

File tree

4 files changed

+340
-3
lines changed

4 files changed

+340
-3
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,9 @@ To specify the current working directory, simply type a colon, without any prece
253253
## Notes
254254
Thanks to Dima Kukushkin ([xobb1t](https://github.com/xobb1t)) for the original work on this plugin. Also, thank you to [facelessuser](https://github.com/facelessuser), and by extension biermeester and matthjes, for the idea of platform-specific settings. Additional thanks to [kemayo](https://github.com/kemayo) for the work on identifying the git executable.
255255

256+
### Libraries Used
257+
* [ushlex](https://bitbucket.org/mixmastamyk/ushlex) - An improved version of shlex that supports Unicode characters on Python 2.
258+
256259
### Contributors
257260
* [alirezadot](https://github.com/alirezadot)
258261
* [aventurella](https://github.com/aventurella)

advanced_new_file/commands/command_base.py

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010
from ..completions.nix_completion import NixCompletion
1111
from ..completions.windows_completion import WindowsCompletion
1212

13-
VIEW_NAME = "AdvancedNewFileCreation"
13+
if not IS_ST3:
14+
from ..lib.ushlex import split as st2_shlex_split
1415

16+
VIEW_NAME = "AdvancedNewFileCreation"
1517

1618
class AdvancedNewFileBase(object):
1719

@@ -121,14 +123,32 @@ def __validate_folder_index(self, folder_index):
121123
folder_index = 0
122124
return folder_index
123125

126+
def __parse_for_shell_input(self, path):
    """Tokenize ``path`` with shell quoting rules and re-join the tokens.

    On ST2 (``IS_ST3`` false) input containing non-ASCII characters is
    routed through the bundled unicode-aware splitter; every other input
    goes through the standard ``shlex.split`` after ``str()`` conversion.

    Returns the resulting tokens joined by single spaces.
    """
    # Short-circuit keeps the non-ASCII probe from running on ST3.
    needs_unicode_lexer = (not IS_ST3) and self.__contains_non_ascii(path)
    if needs_unicode_lexer:
        tokens = self.__split_shell_input_for_st2_non_ascii(path)
    else:
        tokens = shlex.split(str(path))
    return " ".join(tokens)
133+
134+
def __split_shell_input_for_st2_non_ascii(self, path):
    """Split ``path`` with the bundled ``ushlex`` splitter.

    ``st2_shlex_split`` is only imported when ``IS_ST3`` is false, so this
    helper must only be called on ST2.
    """
    tokens = st2_shlex_split(path)
    return tokens
136+
137+
def __contains_non_ascii(self, string):
138+
# Don't really like this....
139+
try:
140+
string.decode("ascii")
141+
except UnicodeEncodeError, e:
142+
return True
143+
return False
144+
124145
def split_path(self, path=""):
125146
HOME_REGEX = r"^~[/\\]"
126147
root = None
127148
try:
128149
root, path = self.platform.split(path)
129150
if self.settings.get(SHELL_INPUT_SETTING, False) and len(path) > 0:
130-
split_path = shlex.split(path)
131-
path = " ".join(split_path)
151+
path = self.__parse_for_shell_input(path)
132152
# Parse if alias
133153
if TOP_LEVEL_SPLIT_CHAR in path and root is None:
134154
parts = path.rsplit(TOP_LEVEL_SPLIT_CHAR, 1)
@@ -403,3 +423,17 @@ def _find_open_file(self, file_name):
403423
if view_name != "" and view_name == file_name:
404424
return view
405425
return None
426+
427+
def test_split(s, comments=False, posix=True):
428+
is_str = False
429+
if type(s) is str:
430+
s = unicode(s)
431+
is_str = True
432+
lex = shlex(s, posix=posix)
433+
lex.whitespace_split = True
434+
if not comments:
435+
lex.commenters = ''
436+
if is_str:
437+
return [ str(x) for x in list(lex) ]
438+
else:
439+
return list(lex)

advanced_new_file/lib/ushlex.py

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
# -*- coding: utf-8 -*-
2+
"""A lexical analyzer class for simple shell-like syntaxes."""
3+
# Source https://bitbucket.org/mixmastamyk/ushlex
4+
5+
# Module and documentation by Eric S. Raymond, 21 Dec 1998
6+
# Input stacking and error message cleanup added by ESR, March 2000
7+
# push_source() and pop_source() made explicit by ESR, January 2001.
8+
# Posix compliance, split(), string arguments, and
9+
# iterator interface by Gustavo Niemeyer, April 2003.
10+
# Modified to support Unicode by Colin Walters, Dec 2007
11+
12+
import os.path
13+
import sys
14+
import unicodedata
15+
from collections import deque
16+
from StringIO import StringIO
17+
18+
__all__ = ["shlex", "split"]
19+
20+
class shlex:
    """A lexical analyzer class for simple shell-like syntaxes.

    Tokens are read one character at a time from ``instream`` by
    ``read_token``, which is driven by ``self.state``:

    * ``' '``  -- between tokens (skipping whitespace)
    * ``'a'``  -- inside a word
    * a quote character -- inside a quoted section opened by that quote
    * an escape character -- immediately after an escape
    * ``None`` -- past end of input

    When ``utf`` is true, whitespace and word characters are additionally
    recognised through their ``unicodedata`` general category.
    """
    def __init__(self, instream=None, infile=None, posix=False, utf=True):
        """Set up the lexer.

        instream -- a string (wrapped in ``StringIO``) or a file-like
                    object; ``sys.stdin`` is used when it is None.
        infile   -- name associated with ``instream``; ignored (reset to
                    None) when reading from ``sys.stdin``.
        posix    -- selects POSIX rules; also makes the EOF marker ``None``
                    instead of the empty string.
        utf      -- enable ``unicodedata`` based character classification.
        """
        if isinstance(instream, basestring):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        # Value returned by get_token()/read_token() to signal end of input.
        if posix:
            self.eof = None
        else:
            self.eof = ''
        self.utf = utf
        self.commenters = '#'
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix and not self.utf:
            # NOTE(review): the whitespace inside the first literal below
            # looks like an encoding artifact -- confirm against upstream.
            self.wordchars += ('ßà áâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        elif self.posix:
            # We dynamically determine character classes below, except
            # by default _ is a word character
            self.wordchars = '_'
        self.whitespace = ' \t\r\n'
        # When true, only whitespace ends a word (see the 'a' state in
        # read_token); split() turns this on.
        self.whitespace_split = False
        self.quotes = '\'"'
        self.escape = '\\'
        # Quote characters inside which the escape character is honoured
        # (POSIX mode only).
        self.escapedquotes = '"'
        self.state = ' '
        # Tokens/characters pushed back to be returned before reading more.
        self.pushback = deque()
        self.lineno = 1
        self.debug = 0
        self.token = ''
        # Stack of (infile, instream, lineno) saved by push_source().
        self.filestack = deque()
        # When set, a token equal to this value makes get_token() pass the
        # following token to sourcehook().
        self.source = None
        # self.debug was just set to 0, so this message is never printed
        # during construction.
        if self.debug:
            print 'shlex: reading from %s, line %d' \
                  % (self.instream, self.lineno)

    def push_token(self, tok):
        "Push a token onto the stack popped by the get_token method"
        if self.debug >= 1:
            print "shlex: pushing token " + repr(tok)
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        """Push an input source onto the lexer's input source stack.

        The current (infile, instream, lineno) triple is saved on
        ``filestack``; string sources are wrapped in ``StringIO`` and line
        counting restarts at 1 for the new source.
        """
        if isinstance(newstream, basestring):
            newstream = StringIO(newstream)
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print 'shlex: pushing to file %s' % (self.infile,)
            else:
                print 'shlex: pushing to stream %s' % (self.instream,)

    def pop_source(self):
        """Pop the input source stack.

        Closes the current stream, restores the most recently pushed
        source and resets the lexer state to "between tokens".
        """
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print 'shlex: popping to %s, line %d' \
                  % (self.instream, self.lineno)
        self.state = ' '

    def get_token(self):
        """Get a token from the input stream (or from stack if it's nonempty).

        Returns ``self.eof`` once the current source and every stacked
        source are exhausted.
        """
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print "shlex: popping token " + repr(tok)
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                # Current source is exhausted; resume the one that included it.
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print "shlex: token=" + repr(raw)
            else:
                print "shlex: token=EOF"
        return raw

    def __is_whitespace(self, c, category):
        # True for configured whitespace or, in utf mode, any character
        # whose unicodedata category starts with 'Z'.
        return c in self.whitespace or (self.utf and category[0] == 'Z')

    def __is_wordchar(self, c, category):
        # True for configured word characters or, in utf mode, any character
        # whose unicodedata category starts with 'L' or 'N'.
        return c in self.wordchars or (self.utf and category[0] in ('L', 'N'))

    def read_token(self):
        """Read one raw token from ``instream``, ignoring the pushback stack.

        Returns the token text. In POSIX mode an empty, unquoted result is
        returned as None (the POSIX EOF marker).

        Raises ValueError on end of input inside quotes or directly after
        an escape character.
        """
        # Set once a quoted section has been seen, so that an empty quoted
        # string still counts as a token in POSIX mode.
        quoted = False
        # State to return to after the escaped character has been consumed.
        escapedstate = ' '
        while True:
            nextchar = self.instream.read(1)
            # NOTE(review): unicodedata.category presumably requires a
            # unicode character, so byte-string streams would need
            # utf=False -- confirm.
            if nextchar and self.utf:
                nextcategory = unicodedata.category(nextchar)
            else:
                nextcategory = None
            if nextchar == '\n':
                self.lineno = self.lineno + 1
            if self.debug >= 3:
                print "shlex: in state", repr(self.state), \
                      "I see character:", repr(nextchar)
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                if self.__is_whitespace(nextchar, nextcategory):
                    if self.debug >= 2:
                        print "shlex: I see whitespace in whitespace state"
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    # Discard the rest of the comment line.
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif self.__is_wordchar(nextchar, nextcategory):
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.quotes:
                    # Non-POSIX mode keeps the quote characters in the token.
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print "shlex: I see EOF in quotes state"
                    # XXX what error should be raised here?
                    raise ValueError, "No closing quotation"
                if nextchar == self.state:
                    if not self.posix:
                        self.token = self.token + nextchar
                        self.state = ' '
                        break
                    else:
                        # POSIX: the closing quote continues the current word.
                        self.state = 'a'
                elif self.posix and nextchar in self.escape and \
                     self.state in self.escapedquotes:
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token = self.token + nextchar
            elif self.state in self.escape:
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print "shlex: I see EOF in escape state"
                    # XXX what error should be raised here?
                    raise ValueError, "No escaped character"
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if escapedstate in self.quotes and \
                   nextchar != self.state and nextchar != escapedstate:
                    self.token = self.token + self.state
                self.token = self.token + nextchar
                self.state = escapedstate
            elif self.state == 'a':
                if not nextchar:
                    self.state = None   # end of file
                    break
                if self.__is_whitespace(nextchar, nextcategory):
                    if self.debug >= 2:
                        print "shlex: I see whitespace in word state"
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    # Discard the rest of the comment line.
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                        else:
                            continue
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif self.__is_wordchar(nextchar, nextcategory) or nextchar in self.quotes \
                    or self.whitespace_split:
                    self.token = self.token + nextchar
                else:
                    # Punctuation ends the word; it is pushed back so the
                    # next get_token() call returns it.
                    self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print "shlex: I see punctuation in word state"
                    self.state = ' '
                    if self.token:
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print "shlex: raw token=" + repr(result)
            else:
                print "shlex: raw token=EOF"
        return result

    def sourcehook(self, newfile, encoding='utf-8'):
        """Hook called on a filename to be sourced.

        Strips one pair of surrounding double quotes from ``newfile``,
        resolves relative names against the directory of the current
        ``infile`` (when that is a string), and returns a
        ``(filename, stream)`` tuple with the file opened for reading via
        ``codecs.open`` using ``encoding``.
        """
        from codecs import open
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, basestring) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        return (newfile, open(newfile, "r", encoding))

    def error_leader(self, infile=None, lineno=None):
        """Emit a C-compiler-like, Emacs-friendly error-message leader.

        Missing arguments default to the lexer's current ``infile`` and
        ``lineno``.
        """
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        # The lexer is its own iterator; see next().
        return self

    def next(self):
        """Return the next token, raising StopIteration at end of input.

        This is the Python 2 iterator method name; no ``__next__`` is
        defined on this class.
        """
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token
288+
289+
def split(s, comments=False, posix=True):
    """Split ``s`` into a list of shell-style tokens.

    A byte ``str`` argument is converted with ``unicode(s)`` before lexing
    and every resulting token is converted back with ``str()``; any other
    input is lexed as given and the tokens are returned unchanged.

    comments -- when false (the default) comment handling is disabled by
                clearing the lexer's ``commenters``.
    posix    -- passed through to the ``shlex`` constructor.
    """
    was_byte_string = type(s) is str
    if was_byte_string:
        s = unicode(s)

    lexer = shlex(s, posix=posix)
    lexer.whitespace_split = True
    if not comments:
        lexer.commenters = ''

    tokens = list(lexer)
    if not was_byte_string:
        return tokens
    return [str(token) for token in tokens]

advanced_new_file/reloader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
".lib",
2929
".lib.package_resources",
30+
".lib.ushlex",
3031

3132
".completions",
3233
'.completions.nix_completion',

0 commit comments

Comments
 (0)