From 03ab1d39840c461e60ff6d24aa63e60ed1f5158c Mon Sep 17 00:00:00 2001
From: Erik Osheim <erik@osheim.org>
Date: Mon, 1 Mar 2010 01:20:31 -0500
Subject: [PATCH] some buffer cleanup... better encoding support

--HG--
branch : pmacs2
---
 buffer/__init__.py | 94 ++++++++++++++++++++++++++--------------------
 method/__init__.py | 12 ++++++
 2 files changed, 65 insertions(+), 41 deletions(-)

diff --git a/buffer/__init__.py b/buffer/__init__.py
index 4770ff0..d596e99 100644
--- a/buffer/__init__.py
+++ b/buffer/__init__.py
@@ -1,6 +1,7 @@
 from util import defaultdict
 import codecs, datetime, grp, os, pwd, re, shutil, stat, string
 import fcntl, select, pty, threading
+import chardet, tempfile
 #import aes, dirutil, regex, highlight, lex, term
 import dirutil, regex, highlight, lex, term
 from point import Point
@@ -8,10 +9,11 @@ from subprocess import Popen, PIPE, STDOUT
 from keyinput import MAP
 
 # undo/redo stack constants
-ACT_NONE    = -1
-ACT_NORM    = 0
-ACT_UNDO    = 1
-ACT_REDO    = 2
+ACT_NONE = 'none'
+ACT_NORM = 'norm'
+ACT_UNDO = 'undo'
+ACT_REDO = 'redo'
+
 STACK_LIMIT = 1024
 
 def hasher(data):
@@ -30,12 +32,12 @@ class FileGoneError(Exception): pass
 # used for undo/redo stacks when text will need to be added back
 class AddMove(object):
     def __init__(self, buffer, p, lines):
-        self.buffer = buffer
-        self.p = p
-        self.lines = lines
+        self.buffer  = buffer   # buffer that restore() inserts back into
+        self.p       = p        # position handed to insert_lines() on restore
+        self.lines   = lines    # lines handed to insert_lines() on restore
         self.undo_id = buffer.undo_id
-    def restore(self, act=ACT_UNDO):
-        assert act == ACT_UNDO or act == ACT_REDO
+    def restore(self, act):  # act is now required; the ACT_UNDO default is gone
+        assert act in (ACT_UNDO, ACT_REDO)  # ACT_NONE and ACT_NORM are rejected
         self.buffer.insert_lines(self.p, self.lines, act)
     def getpos(self):
         return self.p
@@ -43,12 +45,12 @@ class AddMove(object):
 # used for undo/redo stacks when text will need to be removed
 class DelMove(object):
     def __init__(self, buffer, p1, p2):
-        self.buffer = buffer
-        self.p1     = p1
-        self.p2     = p2
+        self.buffer  = buffer   # buffer that restore() deletes from
+        self.p1      = p1       # first point handed to delete(); also what getpos() returns
+        self.p2      = p2       # second point handed to delete()
         self.undo_id = buffer.undo_id
     def restore(self, act):
-        assert act == ACT_UNDO or act == ACT_REDO
+        assert act in (ACT_UNDO, ACT_REDO)  # ACT_NONE and ACT_NORM are rejected
         self.buffer.delete(self.p1, self.p2, act)
     def getpos(self):
         return self.p1
@@ -76,9 +78,20 @@ class Buffer(object):
         self.metadata    = {}
 
     def _detect_nl_type(self, data):
-        mac_c  = len(self.mac_re.findall(data))
-        unix_c = len(self.unix_re.findall(data))
-        win_c  = len(self.win_re.findall(data))
+        mac_c = unix_c = win_c = 0
+        l = len(data)
+        i = 0
+        while i < l:
+            if data[i:i + 2] == '\r\n':
+                win_c += 1
+                i += 2
+            else:
+                if data[i] == '\n':
+                    unix_c += 1
+                elif data[i] == '\r':
+                    mac_c += 1
+                i += 1
+
         if (unix_c and mac_c) or (unix_c and win_c) or (mac_c and win_c):
             # warn the user?
             pass
@@ -264,6 +277,12 @@ class Buffer(object):
 
         # the file has not been modified now
         self.modified = False
+    def backup(self):
+        '''copy self.path to a new temp file and return the temp file's path'''
+        fd, tpath = tempfile.mkstemp(prefix='pmc')
+        os.close(fd)  # mkstemp returns a raw OS descriptor, not a file object
+        shutil.copyfile(self.path, tpath)
+        return tpath
     def readonly(self):
         return False
     def read_filter(self, data):
@@ -560,18 +579,12 @@ class FileBuffer(Buffer):
     def __init__(self, path, name=None):
         '''fb = FileBuffer(path)'''
         Buffer.__init__(self)
-        self.path     = os.path.realpath(path)
-        self.checksum = None
-        self.bytemark = ''
-        self.codec    = 'utf-8'
-        if name is None:
-            self._name = os.path.basename(self.path)
-        else:
-            self._name = name
-        if os.path.exists(self.path) and not os.access(self.path, os.W_OK):
-            self._readonly = True
-        else:
-            self._readonly = False
+        self.path      = os.path.realpath(path)
+        self.checksum  = None
+        self.codec     = 'utf-8'
+        self._name     = name or os.path.basename(self.path)  # resolved path, as before
+        self._readonly = os.path.exists(path) and not os.access(path, os.W_OK)
+
     def readonly(self):
         return self._readonly
 
@@ -624,30 +637,29 @@ class FileBuffer(Buffer):
         if self.path_exists():
             f = self._open_file_r()
             data = f.read()
-            if '\t' in data:
-                self.writetabs = True
-            
             f.close()
             self.store_checksum(data)
+            self.codec = (chardet.detect(data)['encoding'] or 'utf-8').lower()  # None if undetected
         else:
             data = ''
+            self.codec = 'utf-8'
 
-        if data.startswith('\xEF\xBB\xBF'):
-            # utf-8 bytemark
-            self.bytemark = data[:3]
-            data          = data[3:]
+        if self.codec == 'utf-8' and data.startswith(codecs.BOM_UTF8):
+            self.codec = 'utf-8-sig'
+        elif self.codec.startswith('utf-16'):
+            self.codec = 'utf-16'
 
+        data = data.decode(self.codec)
+
+        if '\t' in data: self.writetabs = True
         self.nl = self._detect_nl_type(data)
+
         data = self.read_filter(data)
 
         if '\x00' in data[:8192]:
             raise BinaryDataException("binary files are not supported")
 
-        for codec in ('utf-8', 'latin-1'):
-            data2 = self.decode(data, codec)
-            if data2 is not None: return data2
-
-        raise BinaryDataException("binary files are not supported")
+        return data
 
     def open(self):
         data = self.read()
@@ -687,7 +699,7 @@ class FileBuffer(Buffer):
             data = self.write_filter(data.encode(self.codec))
     
             f2 = self._open_file_w(self.path, preserve=False)
-            f2.write(self.bytemark + data)
+            f2.write(data)
             f2.close()
         #except Exception, e:
         except NameError, e:
diff --git a/method/__init__.py b/method/__init__.py
index 6a82660..ef564ad 100644
--- a/method/__init__.py
+++ b/method/__init__.py
@@ -1,3 +1,4 @@
+import codecs
 import os, commands, re, tempfile
 from subprocess import Popen, PIPE, STDOUT
 
@@ -1151,3 +1152,14 @@ class SetTokenColors(Method):
             a.cached_colors = {}
             a.token_colors[name] = colors
             w.set_error('Color for %s set to %r' % (name, colors))
+
+class SetCodec(Method):  # validates a codec name, then stores it on w.buffer
+    args = [arg('codec', p='Codec: ', h='')]
+    def _execute(self, w, **vargs):
+        codec = vargs['codec']
+        try: codecs.lookup(codec)  # validation only; the result is discarded
+        except LookupError:
+            w.set_error('Codec %r was not found' % codec)
+            return
+        w.buffer.codec = codec
+        w.set_error('setting %r encoding to %r' % (w.buffer.name(), codec))