pmacs3/method/utf8.py

import re
import unicodedata
from method import Method, Argument, arg

# Human-readable descriptions keyed by the two-letter Unicode general-category
# abbreviation.  Utf8Describe._execute looks up the value returned by
# unicodedata.category() here to build the "Category" line of its report.
category_map = {
    'Lu': 'Letter, Uppercase',
    'Ll': 'Letter, Lowercase',
    'Lt': 'Letter, Titlecase',
    'Lm': 'Letter, Modifier',
    'Lo': 'Letter, Other',
    'Mn': 'Mark, Nonspacing',
    'Mc': 'Mark, Spacing Combining',
    'Me': 'Mark, Enclosing',
    'Nd': 'Number, Decimal Digit',
    'Nl': 'Number, Letter',
    'No': 'Number, Other',
    'Pc': 'Punctuation, Connector',
    'Pd': 'Punctuation, Dash',
    'Ps': 'Punctuation, Open',
    'Pe': 'Punctuation, Close',
    'Pi': 'Punctuation, Initial quote (may behave like Ps or Pe depending on usage)',
    'Pf': 'Punctuation, Final quote (may behave like Ps or Pe depending on usage)',
    'Po': 'Punctuation, Other',
    'Sm': 'Symbol, Math',
    'Sc': 'Symbol, Currency',
    'Sk': 'Symbol, Modifier',
    'So': 'Symbol, Other',
    'Zs': 'Separator, Space',
    'Zl': 'Separator, Line',
    'Zp': 'Separator, Paragraph',
    'Cc': 'Other, Control',
    'Cf': 'Other, Format',
    'Cs': 'Other, Surrogate',
    'Co': 'Other, Private Use',
    'Cn': 'Other, Not Assigned (no characters in the file have this property)',
}

# Human-readable descriptions keyed by bidirectional-class abbreviation.
# Utf8Describe._execute looks up the value returned by
# unicodedata.bidirectional() here to build the "Bidirectional" line.
bidirect_map = {
    'L':   'Left-to-Right',
    'LRE': 'Left-to-Right Embedding',
    'LRO': 'Left-to-Right Override',
    'R':   'Right-to-Left',
    'AL':  'Right-to-Left Arabic',
    'RLE': 'Right-to-Left Embedding',
    'RLO': 'Right-to-Left Override',
    'PDF': 'Pop Directional Format',
    'EN':  'European Number',
    'ES':  'European Number Separator',
    'ET':  'European Number Terminator',
    'AN':  'Arabic Number',
    'CS':  'Common Number Separator',
    'NSM': 'Nonspacing Mark',
    'BN':  'Boundary Neutral',
    'B':   'Paragraph Separator',
    'S':   'Segment Separator',
    'WS':  'Whitespace',
    'ON':  'Other Neutrals',
}

# Human-readable descriptions keyed by integer canonical combining class.
# Utf8Describe._execute looks up the value returned by
# unicodedata.combining() here to build the "Combining" line.  Only the
# classes listed below have a description; any other value gets the
# caller's default text.
combine_map = {
    0:   'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
    1:   'Overlays and interior',
    7:   'Nuktas',
    8:   'Hiragana/Katakana voicing marks',
    9:   'Viramas',
    10:  'Start of fixed position classes',
    199: 'End of fixed position classes',
    200: 'Below left attached',
    202: 'Below attached',
    204: 'Below right attached',
    208: 'Left attached (reordrant around single base character)',
    210: 'Right attached',
    212: 'Above left attached',
    214: 'Above attached',
    216: 'Above right attached',
    218: 'Below left',
    220: 'Below',
    222: 'Below right',
    224: 'Left (reordrant around single base character)',
    226: 'Right',
    228: 'Above right',
    230: 'Above',
    232: 'Above left',
    233: 'Double below',
    234: 'Double above',
    240: 'Below (iota subscript)',
}

# Human-readable descriptions keyed by East Asian width abbreviation.
# Utf8Describe._execute looks up the value returned by
# unicodedata.east_asian_width() here to build the "Width" line.
width_map = {
    'W':  'East Asian Wide',
    'F':  'East Asian Full-width',
    'A':  'East Asian Ambiguous',
    'H':  'East Asian Half-width',
    'Na': 'East Asian Narrow',
    'N':  'Narrow',
}

def hex2(i):
    '''Return the hex digits of i without the leading '0x', prefixed with a
    single '0' when that is needed to make the digit count even.'''
    digits = hex(i)[2:]
    odd_length = len(digits) % 2 != 0
    return '0' + digits if odd_length else digits

def uniesc(i):
    '''Return the text of a backslash-x escape for i: a backslash, an 'x',
    and then the even-length hex digits produced by hex2(i).'''
    return '\\x%s' % hex2(i)

def unichar(s):
    '''Turn the body of a unicode string literal into the string it denotes.

    s is placed between u' and ' and the resulting literal is evaluated, so
    backslash escapes such as \\u00e9 inside s are interpreted.

    Returns the evaluated string, or None when the text cannot be evaluated
    or does not evaluate to a unicode string.  A lone single quote or a
    trailing backslash in s breaks the literal and therefore yields None.
    '''
    literal = "u'" + s + "'"
    try:
        # SECURITY NOTE(review): this eval()s text typed by the user (or read
        # from the buffer).  A quote inside s can close the literal early and
        # run an arbitrary expression; the empty globals/locals dicts do not
        # remove access to builtins.  Left in place because callers rely on
        # full string-literal escape handling -- consider a dedicated parser.
        value = eval(literal, {}, {})
    except Exception:
        # Malformed literal (SyntaxError, bad escape, ...).  Unlike a bare
        # except, this lets KeyboardInterrupt/SystemExit propagate.
        return None
    # An injected expression such as "' or 1 or '" evaluates to a non-string;
    # treat that as invalid input instead of handing it to callers.
    if not isinstance(value, type(u'')):
        return None
    return value

def unicodeget(u, fname, fallback):
    '''Call unicodedata.<fname>(u) and return its result, or fallback.

    u        -- the value passed as the single argument to the function
    fname    -- name of a function in the unicodedata module
    fallback -- returned when the function does not exist, raises, or
                produces no information (None or an empty string)

    Numeric results are returned as-is even when they are zero: a combining
    class of 0 is a real answer (combine_map has an entry for it), not a
    missing one.
    '''
    try:
        value = getattr(unicodedata, fname)(u)
    except Exception:
        # Unknown function name, wrong argument count, or "no such value"
        # errors from unicodedata all mean there is nothing to report.
        return fallback
    # Previously a falsy result fell off the end and returned None, so the
    # fallback was never used for empty results such as '' (decomposition).
    if value is None or value == '':
        return fallback
    return value

class Utf8Describe(Method):
    '''get detailed utf-8 data about a particular utf-8 code point'''
    args = [arg("code", t=type(""), p="Code Point: ", h="UTF-8 code point to use")]
    # Matches a literal backslash-u followed by one or more pairs of hex
    # digits.  The backslash has to be escaped for the regex engine: the
    # unescaped form '\u(' is rejected by Python 3's re module and matches a
    # bare 'u' on Python 2.  Not referenced by the code in this class.
    cpt_re = re.compile(r'^\\u(?:[0-9a-fA-F]{2})+$')
    # Report template; filled positionally in _execute, so the order of the
    # %s slots must match the tuple built there.
    format = '''
Glyph          %s
Name           %s
Code           %s
Category       %s

Bidirectional  %s
Combining      %s
Width          %s
Mirroring      %s
Decomposition  %s

Decimal        %s
Digit          %s
Lookup         %s
Normalize      %s
Numeric        %s'''
    def _execute(self, w, **vargs):
        '''Evaluate vargs['code'] as the body of a unicode literal and show a
        report about the result in the '*Utf8-Info*' data buffer.

        If the text cannot be evaluated, an "invalid: ..." message is set on
        w and nothing else happens.
        '''
        u = unichar(vargs['code'])
        if u is None:
            w.set_error("invalid: %s" % vargs['code'])
            return

        # Raw property codes; each is mapped to a description below and also
        # echoed in parentheses after it.
        cat_code   = unicodeget(u, 'category', '??')
        bidi_code  = unicodeget(u, 'bidirectional', '?')
        comb_class = unicodeget(u, 'combining', '?')
        width_code = unicodeget(u, 'east_asian_width', '?')

        # NOTE(review): [2:-1] strips a two-character prefix and the closing
        # quote from repr(u); presumably this targets a u'...' repr -- confirm
        # for the Python version in use.
        code     = repr(u)[2:-1]
        name     = unicodeget(u, 'name', 'Unnamed')
        category = category_map.get(cat_code, 'No Category') + ' (%s)' % cat_code
        bidirect = bidirect_map.get(bidi_code, 'No Directional Info') + ' (%s)' % bidi_code
        combine  = combine_map.get(comb_class, 'No Combining Info') + ' (%s)' % comb_class

        mirror = unicodeget(u, 'mirrored', 'Unknown Mirroring')
        width  = width_map.get(width_code, 'Unknown Width') + ' (%s)' % width_code

        decomposition = unicodeget(u, 'decomposition', 'No Decomposition Info')
        decimal       = unicodeget(u, 'decimal', 'n/a')
        digit         = unicodeget(u, 'digit', 'n/a')
        # NOTE(review): unicodedata.lookup expects a character name and
        # unicodedata.normalize needs a form argument as well, so calling
        # them with just u is expected to end up at the 'n/a' fallback.
        lookup        = unicodeget(u, 'lookup', 'n/a')
        normalize     = unicodeget(u, 'normalize', 'n/a')
        numeric       = unicodeget(u, 'numeric', 'n/a')

        data = self.format % (u, name, code, category, bidirect, combine, width,
                              mirror, decomposition, decimal, digit, lookup,
                              normalize, numeric)
        w.application.data_buffer('*Utf8-Info*', data.strip(), switch_to=True)

class Utf8DescribeChar(Utf8Describe):
    '''get utf-8 representation of the highlighted character'''
    args = []
    def _execute(self, w, **vargs):
        '''Describe the text between the logical cursor and p.add(1, 0)
        (presumably the single character at the cursor -- confirm against
        the buffer API) using Utf8Describe's report.'''
        p = w.logical_cursor()
        u = w.buffer.get_substring(p, p.add(1, 0))
        # Utf8Describe treats its input as the body of a u'...' literal, so a
        # raw backslash or single quote taken from the buffer would break the
        # literal and be reported as invalid.  Escape them so the character
        # itself is what gets described.  Backslashes go first so the
        # backslash added for the quote is not doubled.
        code = u.replace('\\', '\\\\').replace("'", "\\'")
        Utf8Describe._execute(self, w, code=code)

class Utf8Query(Method):
    '''look up a glyph by its unicode name and report it'''
    args = [arg("name", t=type(""), p="Glyph Name: ", h="the name of the UTF-8 Glyph")]
    def _execute(self, w, **vargs):
        '''Resolve vargs['name'] with unicodedata.lookup and report the glyph
        and its repr-based code through w.set_error; a "not found" message is
        reported instead when the name is unknown.'''
        name = vargs['name']
        try:
            u = unicodedata.lookup(name)
        except KeyError:
            w.set_error("glyph %r was not found" % name)
            return
        # Kept outside the try so a KeyError raised while reporting is not
        # mistaken for an unknown glyph name.
        # NOTE(review): [2:-1] presumably strips the u'...' wrapper of a
        # Python 2 unicode repr -- confirm for the Python version in use.
        w.set_error("glyph %s (%s)" % (u, repr(u)[2:-1]))

class Utf8Insert(Method):
    '''insert UTF-8 data into the buffer'''
    args = [arg("data", t=type(""), p="UTF-8 Data: ", h="the UTF-8 data to use")]
    def _execute(self, w, **vargs):
        '''Evaluate vargs['data'] as the body of a unicode literal and report
        an "invalid: ..." message through w.set_error when that fails.'''
        # The argument is declared as "data" above; reading vargs['code']
        # here raised KeyError on every call.
        data = vargs['data']
        u = unichar(data)
        # TODO(review): despite the docstring, nothing is inserted into the
        # buffer when u is valid -- the insertion step appears to be missing.
        if u is None:
            w.set_error("invalid: %s" % data)
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`import re`
			`import unicodedata`
			`from method import Method, Argument, arg`
some basic utf-8 support --HG-- branch : pmacs2 2009-06-11 18:52:57 -04:00
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`category_map = {`
			`'Lu': 'Letter, Uppercase',`
			`'Ll': 'Letter, Lowercase',`
			`'Lt': 'Letter, Titlecase',`
			`'Lm': 'Letter, Modifier',`
			`'Lo': 'Letter, Other',`
			`'Mn': 'Mark, Nonspacing',`
			`'Mc': 'Mark, Spacing Combining',`
			`'Me': 'Mark, Enclosing',`
			`'Nd': 'Number, Decimal Digit',`
			`'Nl': 'Number, Letter',`
			`'No': 'Number, Other',`
			`'Pc': 'Punctuation, Connector',`
			`'Pd': 'Punctuation, Dash',`
			`'Ps': 'Punctuation, Open',`
			`'Pe': 'Punctuation, Close',`
			`'Pi': 'Punctuation, Initial quote (may behave like Ps or Pe depending on usage)',`
			`'Pf': 'Punctuation, Final quote (may behave like Ps or Pe depending on usage)',`
			`'Po': 'Punctuation, Other',`
			`'Sm': 'Symbol, Math',`
			`'Sc': 'Symbol, Currency',`
			`'Sk': 'Symbol, Modifier',`
			`'So': 'Symbol, Other',`
			`'Zs': 'Separator, Space',`
			`'Zl': 'Separator, Line',`
			`'Zp': 'Separator, Paragraph',`
			`'Cc': 'Other, Control',`
			`'Cf': 'Other, Format',`
			`'Cs': 'Other, Surrogate',`
			`'Co': 'Other, Private Use',`
			`'Cn': 'Other, Not Assigned (no characters in the file have this property)',`
			`}`
some basic utf-8 support --HG-- branch : pmacs2 2009-06-11 18:52:57 -04:00
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`bidirect_map = {`
			`'L': 'Left-to-Right',`
			`'LRE': 'Left-to-Right Embedding',`
			`'LRO': 'Left-to-Right Override',`
			`'R': 'Right-to-Left',`
			`'AL': 'Right-to-Left Arabic',`
			`'RLE': 'Right-to-Left Embedding',`
			`'RLO': 'Right-to-Left Override',`
			`'PDF': 'Pop Directional Format',`
			`'EN': 'European Number',`
			`'ES': 'European Number Separator',`
			`'ET': 'European Number Terminator',`
			`'AN': 'Arabic Number',`
			`'CS': 'Common Number Separator',`
			`'NSM': 'Nonspacing Mark',`
			`'BN': 'Boundary Neutral',`
			`'B': 'Paragraph Separator',`
			`'S': 'Segment Separator',`
			`'WS': 'Whitespace',`
			`'ON': 'Other Neutrals',`
			`}`
some basic utf-8 support --HG-- branch : pmacs2 2009-06-11 18:52:57 -04:00
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`combine_map = {`
			`0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',`
			`1: 'Overlays and interior',`
			`7: 'Nuktas',`
			`8: 'Hiragana/Katakana voicing marks',`
			`9: 'Viramas',`
			`10: 'Start of fixed position classes',`
			`199: 'End of fixed position classes',`
			`200: 'Below left attached',`
			`202: 'Below attached',`
			`204: 'Below right attached',`
			`208: 'Left attached (reordrant around single base character)',`
			`210: 'Right attached',`
			`212: 'Above left attached',`
			`214: 'Above attached',`
			`216: 'Above right attached',`
			`218: 'Below left',`
			`220: 'Below',`
			`222: 'Below right',`
			`224: 'Left (reordrant around single base character)',`
			`226: 'Right',`
			`228: 'Above right',`
			`230: 'Above',`
			`232: 'Above left',`
			`233: 'Double below',`
			`234: 'Double above',`
			`240: 'Below (iota subscript)',`
			`}`

			`width_map = {`
			`'W': 'East Asian Wide',`
			`'F': 'East Asian Full-width',`
			`'A': 'East Asian Ambiguous',`
			`'H': 'East Asian Half-width',`
			`'Na': 'East Asian Narrow',`
			`'N': 'Narrow',`
			`}`

work for improvements --HG-- branch : pmacs2 2009-09-05 20:10:13 -04:00			`def hex2(i):`
			`h = hex(i)[2:]`
			`if len(h) % 2 == 1:`
			`return '0' + h`
			`else:`
			`return h`

			`def uniesc(i):`
			`return '\\x' + hex2(i)`

			`def unichar(s):`
			`s = "u'" + s + "'"`
			`try:`
			`return eval(s, {}, {})`
			`except:`
			`return None`

allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`def unicodeget(u, fname, fallback):`
			`try:`
			`f = getattr(unicodedata, fname)`
			`value = f(u)`
			`if value:`
			`return value`
			`except:`
			`return fallback`
some basic utf-8 support --HG-- branch : pmacs2 2009-06-11 18:52:57 -04:00
			`class Utf8Describe(Method):`
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`'''get detailed utf-8 data about a particular utf-8 code point'''`
			`args = [arg("code", t=type(""), p="Code Point: ", h="UTF-8 code point to use")]`
			`cpt_re = re.compile('^\\u(?:[0-9a-fA-F]{2})+$')`
			`format = '''`
			`Glyph %s`
			`Name %s`
more utf-8 improvements --HG-- branch : pmacs2 2009-06-12 10:53:32 -04:00			`Code %s`
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`Category %s`

			`Bidirectional %s`
			`Combining %s`
			`Width %s`
			`Mirroring %s`
			`Decomposition %s`

			`Decimal %s`
			`Digit %s`
			`Lookup %s`
			`Normalize %s`
			`Numeric %s'''`
			`def _execute(self, w, **vargs):`
			`s = "u'" + vargs['code'] + "'"`
work for improvements --HG-- branch : pmacs2 2009-09-05 20:10:13 -04:00			`u = unichar(vargs['code'])`
			`if u is None:`
			`w.set_error("invalid: %s" % vargs['code'])`
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`return`

			`a = unicodeget(u, 'category', '??')`
			`b = unicodeget(u, 'bidirectional', '?')`
			`c = unicodeget(u, 'combining', '?')`
			`d = unicodeget(u, 'east_asian_width', '?')`

more utf-8 improvements --HG-- branch : pmacs2 2009-06-12 10:53:32 -04:00			`code = repr(u)[2:-1]`
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`name = unicodeget(u, 'name', 'Unnamed')`
			`category = category_map.get(a, 'No Category') + ' (%s)' % a`
			`bidirect = bidirect_map.get(b, 'No Directional Info') + ' (%s)' % b`
			`combine = combine_map.get(c, 'No Combining Info') + ' (%s)' % c`

			`mirror = unicodeget(u, 'mirrored', 'Unknown Mirroring')`
			`width = width_map.get(d, 'Unknown Width') + ' (%s)' % d`

			`decomposition = unicodeget(u, 'decomposition', 'No Decomposition Info')`
			`decimal = unicodeget(u, 'decimal', 'n/a')`
			`digit = unicodeget(u, 'digit', 'n/a')`
			`lookup = unicodeget(u, 'lookup', 'n/a')`
			`normalize = unicodeget(u, 'normalize', 'n/a')`
			`numeric = unicodeget(u, 'numeric', 'n/a')`

more utf-8 improvements --HG-- branch : pmacs2 2009-06-12 10:53:32 -04:00			`data = self.format % (u, name, code, category, bidirect, combine, width,`
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`mirror, decomposition, decimal, digit, lookup,`
			`normalize, numeric)`
			`w.application.data_buffer('Utf8-Info', data.strip(), switch_to=True)`

			`class Utf8DescribeChar(Utf8Describe):`
			`'''get utf-8 representation of the highlighted character'''`
			`args = []`
some basic utf-8 support --HG-- branch : pmacs2 2009-06-11 18:52:57 -04:00			`def _execute(self, w, **vargs):`
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`p = w.logical_cursor()`
			`u = w.buffer.get_substring(p, p.add(1, 0))`
			`Utf8Describe._execute(self, w, code=u)`
some basic utf-8 support --HG-- branch : pmacs2 2009-06-11 18:52:57 -04:00
more utf-8 improvements --HG-- branch : pmacs2 2009-06-12 10:53:32 -04:00			`class Utf8Query(Method):`
			`'''insert UTF-8 data into the buffer'''`
			`args = [arg("name", t=type(""), p="Glpyh Name: ", h="the name of the UTF-8 Glpyh")]`
			`def _execute(self, w, **vargs):`
			`name = vargs['name']`
			`try:`
			`u = unicodedata.lookup(name)`
			`w.set_error("glyph %s (%s)" % (u, repr(u)[2:-1]))`
			`except KeyError:`
			`w.set_error("glpyh %r was not found" % name)`

some basic utf-8 support --HG-- branch : pmacs2 2009-06-11 18:52:57 -04:00			`class Utf8Insert(Method):`
allow utf-8 data in error msgs, etc --HG-- branch : pmacs2 2009-06-11 23:08:57 -04:00			`'''insert UTF-8 data into the buffer'''`
			`args = [arg("data", t=type(""), p="UTF-8 Data: ", h="the UTF-8 data to use")]`
some basic utf-8 support --HG-- branch : pmacs2 2009-06-11 18:52:57 -04:00			`def _execute(self, w, **vargs):`
			`s = "u'" + vargs['data'] + "'"`
work for improvements --HG-- branch : pmacs2 2009-09-05 20:10:13 -04:00			`u = unichar(vargs['code'])`
			`if u is None:`
some basic utf-8 support --HG-- branch : pmacs2 2009-06-11 18:52:57 -04:00			`w.set_error("invalid: %s" % vargs['data'])`