From 03ab1d39840c461e60ff6d24aa63e60ed1f5158c Mon Sep 17 00:00:00 2001 From: Erik Osheim Date: Mon, 1 Mar 2010 01:20:31 -0500 Subject: [PATCH] some buffer cleanup... better encoding support --HG-- branch : pmacs2 --- buffer/__init__.py | 94 ++++++++++++++++++++++++++-------------------- method/__init__.py | 12 ++++++ 2 files changed, 65 insertions(+), 41 deletions(-) diff --git a/buffer/__init__.py b/buffer/__init__.py index 4770ff0..d596e99 100644 --- a/buffer/__init__.py +++ b/buffer/__init__.py @@ -1,6 +1,7 @@ from util import defaultdict import codecs, datetime, grp, os, pwd, re, shutil, stat, string import fcntl, select, pty, threading +import chardet #import aes, dirutil, regex, highlight, lex, term import dirutil, regex, highlight, lex, term from point import Point @@ -8,10 +9,11 @@ from subprocess import Popen, PIPE, STDOUT from keyinput import MAP # undo/redo stack constants -ACT_NONE = -1 -ACT_NORM = 0 -ACT_UNDO = 1 -ACT_REDO = 2 +ACT_NONE = 'none' +ACT_NORM = 'norm' +ACT_UNDO = 'undo' +ACT_REDO = 'redo' + STACK_LIMIT = 1024 def hasher(data): @@ -30,12 +32,12 @@ class FileGoneError(Exception): pass # used for undo/redo stacks when text will need to be added back class AddMove(object): def __init__(self, buffer, p, lines): - self.buffer = buffer - self.p = p - self.lines = lines + self.buffer = buffer + self.p = p + self.lines = lines self.undo_id = buffer.undo_id - def restore(self, act=ACT_UNDO): - assert act == ACT_UNDO or act == ACT_REDO + def restore(self, act): + assert act in (ACT_UNDO, ACT_REDO) self.buffer.insert_lines(self.p, self.lines, act) def getpos(self): return self.p @@ -43,12 +45,12 @@ class AddMove(object): # used for undo/redo stacks when text will need to be removed class DelMove(object): def __init__(self, buffer, p1, p2): - self.buffer = buffer - self.p1 = p1 - self.p2 = p2 + self.buffer = buffer + self.p1 = p1 + self.p2 = p2 self.undo_id = buffer.undo_id def restore(self, act): - assert act == ACT_UNDO or act == ACT_REDO + assert act in (ACT_UNDO, ACT_REDO) self.buffer.delete(self.p1, self.p2, act) def getpos(self): return self.p1 @@ -76,9 +78,20 @@ class Buffer(object): self.metadata = {} def _detect_nl_type(self, data): - mac_c = len(self.mac_re.findall(data)) - unix_c = len(self.unix_re.findall(data)) - win_c = len(self.win_re.findall(data)) + mac_c = unix_c = win_c = 0 + l = len(data) + i = 0 + while i < l: + if data[i:i + 2] == '\r\n': + win_c += 1 + i += 2 + else: + if data[i] == '\n': + unix_c += 1 + elif data[i] == '\r': + mac_c += 1 + i += 1 + if (unix_c and mac_c) or (unix_c and win_c) or (mac_c and win_c): # warn the user? pass @@ -264,6 +277,12 @@ class Buffer(object): # the file has not been modified now self.modified = False + def backup(self): + '''backup path, and return the path to the temporary backup file''' + tf, tpath = tempfile.mkstemp(prefix='pmc') + tf.write(open(self.path, 'rb').read()) + tf.close() + return tpath def readonly(self): return False def read_filter(self, data): @@ -560,18 +579,12 @@ class FileBuffer(Buffer): def __init__(self, path, name=None): '''fb = FileBuffer(path)''' Buffer.__init__(self) - self.path = os.path.realpath(path) - self.checksum = None - self.bytemark = '' - self.codec = 'utf-8' - if name is None: - self._name = os.path.basename(self.path) - else: - self._name = name - if os.path.exists(self.path) and not os.access(self.path, os.W_OK): - self._readonly = True - else: - self._readonly = False + self.path = os.path.realpath(path) + self.checksum = None + self.codec = 'utf-8' + self._name = name or os.path.basename(path) + self._readonly = os.path.exists(path) and not os.access(path, os.W_OK) + def readonly(self): return self._readonly @@ -624,30 +637,29 @@ class FileBuffer(Buffer): if self.path_exists(): f = self._open_file_r() data = f.read() - if '\t' in data: - self.writetabs = True - f.close() self.store_checksum(data) + self.codec = chardet.detect(data)['encoding'].lower() else: data = '' + self.codec = 'utf-8' - if data.startswith('\xEF\xBB\xBF'): - # utf-8 bytemark - self.bytemark = data[:3] - data = data[3:] + if self.codec == 'utf-8' and data.startswith(codecs.BOM_UTF8): + self.codec = 'utf-8-sig' + elif self.codec.startswith('utf-16'): + self.codec = 'utf-16' + data = data.decode(self.codec) + + if '\t' in data: self.writetabs = True self.nl = self._detect_nl_type(data) + data = self.read_filter(data) if '\x00' in data[:8192]: raise BinaryDataException("binary files are not supported") - for codec in ('utf-8', 'latin-1'): - data2 = self.decode(data, codec) - if data2 is not None: return data2 - - raise BinaryDataException("binary files are not supported") + return data def open(self): data = self.read() @@ -687,7 +699,7 @@ class FileBuffer(Buffer): data = self.write_filter(data.encode(self.codec)) f2 = self._open_file_w(self.path, preserve=False) - f2.write(self.bytemark + data) + f2.write(data) f2.close() #except Exception, e: except NameError, e: diff --git a/method/__init__.py b/method/__init__.py index 6a82660..ef564ad 100644 --- a/method/__init__.py +++ b/method/__init__.py @@ -1,3 +1,4 @@ +import codecs import os, commands, re, tempfile from subprocess import Popen, PIPE, STDOUT @@ -1151,3 +1152,14 @@ class SetTokenColors(Method): a.cached_colors = {} a.token_colors[name] = colors w.set_error('Color for %s set to %r' % (name, colors)) + +class SetCodec(Method): + args = [arg('codec', p='Codec: ', h='')] + def _execute(self, w, **vargs): + codec = vargs['codec'] + try: + codecs.lookup(codec) + w.buffer.codec = codec + w.set_error('setting %r encoding to %r' % (w.buffer.name(), codec)) + except LookupError: + w.set_error('Codec %r was not found' % codec)