pmacs3/method/utf8.py

import ast
import re
import unicodedata

from method import Method, Argument, arg

# Two-letter Unicode general-category code -> human-readable description.
category_map = {
    # letters
    "Lu": "Letter, Uppercase",
    "Ll": "Letter, Lowercase",
    "Lt": "Letter, Titlecase",
    "Lm": "Letter, Modifier",
    "Lo": "Letter, Other",
    # marks
    "Mn": "Mark, Nonspacing",
    "Mc": "Mark, Spacing Combining",
    "Me": "Mark, Enclosing",
    # numbers
    "Nd": "Number, Decimal Digit",
    "Nl": "Number, Letter",
    "No": "Number, Other",
    # punctuation
    "Pc": "Punctuation, Connector",
    "Pd": "Punctuation, Dash",
    "Ps": "Punctuation, Open",
    "Pe": "Punctuation, Close",
    "Pi": "Punctuation, Initial quote (may behave like Ps or Pe depending on usage)",
    "Pf": "Punctuation, Final quote (may behave like Ps or Pe depending on usage)",
    "Po": "Punctuation, Other",
    # symbols
    "Sm": "Symbol, Math",
    "Sc": "Symbol, Currency",
    "Sk": "Symbol, Modifier",
    "So": "Symbol, Other",
    # separators
    "Zs": "Separator, Space",
    "Zl": "Separator, Line",
    "Zp": "Separator, Paragraph",
    # everything else
    "Cc": "Other, Control",
    "Cf": "Other, Format",
    "Cs": "Other, Surrogate",
    "Co": "Other, Private Use",
    "Cn": "Other, Not Assigned (no characters in the file have this property)",
}

# Unicode bidirectional class code -> human-readable description.
bidirect_map = {
    'L':   'Left-to-Right',
    'LRE': 'Left-to-Right Embedding',
    'LRO': 'Left-to-Right Override',
    'R':   'Right-to-Left',
    'AL':  'Right-to-Left Arabic',
    'RLE': 'Right-to-Left Embedding',
    'RLO': 'Right-to-Left Override',
    'PDF': 'Pop Directional Format',
    # Directional isolate classes (U+2066..U+2069); newer Unicode databases
    # report these codes, so without entries they had no description.
    'LRI': 'Left-to-Right Isolate',
    'RLI': 'Right-to-Left Isolate',
    'FSI': 'First Strong Isolate',
    'PDI': 'Pop Directional Isolate',
    'EN':  'European Number',
    'ES':  'European Number Separator',
    'ET':  'European Number Terminator',
    'AN':  'Arabic Number',
    'CS':  'Common Number Separator',
    'NSM': 'Nonspacing Mark',
    'BN':  'Boundary Neutral',
    'B':   'Paragraph Separator',
    'S':   'Segment Separator',
    'WS':  'Whitespace',
    'ON':  'Other Neutrals',
}

# Unicode canonical combining class (integer) -> human-readable description.
# Values not listed here have no description in this table.
combine_map = {
    0:   'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
    1:   'Overlays and interior',
    7:   'Nuktas',
    8:   'Hiragana/Katakana voicing marks',
    9:   'Viramas',
    10:  'Start of fixed position classes',
    199: 'End of fixed position classes',
    200: 'Below left attached',
    202: 'Below attached',
    204: 'Below right attached',
    208: 'Left attached (reordrant around single base character)',
    210: 'Right attached',
    212: 'Above left attached',
    214: 'Above attached',
    216: 'Above right attached',
    218: 'Below left',
    220: 'Below',
    222: 'Below right',
    224: 'Left (reordrant around single base character)',
    226: 'Right',
    # The Unicode Character Database defines 228 as Above Left and 232 as
    # Above Right, matching the left/centre/right order of 212-216 and
    # 218-222; the two labels were previously swapped.
    228: 'Above left',
    230: 'Above',
    232: 'Above right',
    233: 'Double below',
    234: 'Double above',
    240: 'Below (iota subscript)',
}

# Unicode East Asian Width code -> human-readable description.
width_map = {
    'W':  'East Asian Wide',
    'F':  'East Asian Full-width',
    'A':  'East Asian Ambiguous',
    'H':  'East Asian Half-width',
    'Na': 'East Asian Narrow',
    # 'N' is the Neutral class (characters that are not East Asian), not
    # "narrow"; narrow characters carry the separate 'Na' code above.
    'N':  'Neutral (Not East Asian)',
}

def hex2(i):
    '''Return the hex digits of i, left-padded with one '0' to an even length.'''
    # hex() yields a '0x'-prefixed string; drop that two-character prefix.
    digits = hex(i)[2:]
    needs_pad = len(digits) % 2 == 1
    return '0' + digits if needs_pad else digits

def uniesc(i):
    '''Return a backslash-x escape string built from the even-length hex of i.'''
    return '\\x%s' % hex2(i)

def unichar(s):
    '''Interpret s as the body of a u'...' string literal.

    Escape sequences in s (for example \\u00e9) are decoded the same way the
    Python parser would decode them. Returns the resulting unicode string, or
    None when s does not form a single valid string literal.
    '''
    literal = "u'" + s + "'"
    try:
        # SECURITY: this used to be a plain eval(), which executed whatever
        # expression the prompt text closed the quote with.  literal_eval
        # accepts literals only, so escapes still decode but code never runs.
        value = ast.literal_eval(literal)
    except Exception:
        # Any parse failure means "not a valid literal".  Exception rather
        # than a bare except so KeyboardInterrupt/SystemExit still propagate.
        return None
    # Input such as  a', u'b  parses as a tuple of literals; callers expect a
    # string, so treat every non-string result as invalid too.
    if not isinstance(value, type(u'')):
        return None
    return value

def unicodeget(u, fname, fallback):
    '''Call unicodedata.<fname>(u) and return its result, or fallback.

    fallback is returned when the function does not exist, when the call
    raises (for example a character with no name or no numeric value), or
    when the result is empty.  Zero is a real answer (combining class 0, the
    digit 0) and is returned as-is; it used to fall through and yield None.
    '''
    try:
        value = getattr(unicodedata, fname)(u)
    except Exception:
        # Deliberately best-effort: each unicodedata function fails in its
        # own way on unsuitable input, and every such failure means "no data".
        return fallback
    if value is None or value == '':
        return fallback
    return value

class Utf8Describe(Method):
    '''get detailed utf-8 data about a particular utf-8 code point'''
    args = [arg("code", t=type(""), p="Code Point: ", h="UTF-8 code point to use")]
    # Raw string: the pattern is a literal backslash, then 'u', then one or
    # more pairs of hex digits.  Written non-raw, the regex engine itself
    # received "\u", which Python 3's re rejects as an incomplete escape as
    # soon as this class body runs.  Not referenced by the methods below.
    cpt_re = re.compile(r'^\\u(?:[0-9a-fA-F]{2})+$')
    # One %s slot per reported property, filled in order by _execute.
    format = '''
Glyph          %s
Name           %s
Code           %s
Category       %s

Bidirectional  %s
Combining      %s
Width          %s
Mirroring      %s
Decomposition  %s

Decimal        %s
Digit          %s
Lookup         %s
Normalize      %s
Numeric        %s'''
    def _execute(self, w, **vargs):
        # vargs['code'] is the text typed at the prompt; unichar() decodes it
        # as the body of a string literal and returns None if that fails.
        text = vargs['code']
        u = unichar(text)
        if u is None:
            w.set_error("invalid: %s" % text)
            return

        # Raw property codes; each one is looked up in its description table
        # and also echoed in parentheses after the description.
        cat_code   = unicodeget(u, 'category', '??')
        bidi_code  = unicodeget(u, 'bidirectional', '?')
        comb_code  = unicodeget(u, 'combining', '?')
        width_code = unicodeget(u, 'east_asian_width', '?')

        # Escaped form of the glyph (e.g. \xe9, \u20ac).  unicode_escape gives
        # the same text repr() shows between the quotes, without depending on
        # the width of repr()'s string prefix, which differs across versions.
        code     = u.encode('unicode_escape').decode('ascii')
        name     = unicodeget(u, 'name', 'Unnamed')
        category = category_map.get(cat_code, 'No Category') + ' (%s)' % cat_code
        bidirect = bidirect_map.get(bidi_code, 'No Directional Info') + ' (%s)' % bidi_code
        combine  = combine_map.get(comb_code, 'No Combining Info') + ' (%s)' % comb_code
        width    = width_map.get(width_code, 'Unknown Width') + ' (%s)' % width_code
        mirror   = unicodeget(u, 'mirrored', 'Unknown Mirroring')

        decomposition = unicodeget(u, 'decomposition', 'No Decomposition Info')
        decimal       = unicodeget(u, 'decimal', 'n/a')
        digit         = unicodeget(u, 'digit', 'n/a')
        # NOTE(review): unicodedata.lookup() takes a character *name* and
        # unicodedata.normalize() takes (form, string); handed one character
        # they raise, so these two rows depend on unicodeget's fallback.
        # Left unchanged because the intended output is not clear from here.
        lookup        = unicodeget(u, 'lookup', 'n/a')
        normalize     = unicodeget(u, 'normalize', 'n/a')
        numeric       = unicodeget(u, 'numeric', 'n/a')

        data = self.format % (u, name, code, category, bidirect, combine, width,
                              mirror, decomposition, decimal, digit, lookup,
                              normalize, numeric)
        # The format begins with a newline; strip it before display.
        w.application.data_buffer('*Utf8-Info*', data.strip(), switch_to=True)

class Utf8DescribeChar(Utf8Describe):
    '''get utf-8 representation of the highlighted character'''
    args = []
    def _execute(self, w, **vargs):
        # Take the text between the cursor and the position one column on.
        p = w.logical_cursor()
        ch = w.buffer.get_substring(p, p.add(1, 0))
        # Utf8Describe re-parses `code` as the body of a single-quoted string
        # literal.  A raw backslash, quote, newline or carriage return breaks
        # that literal and was reported as "invalid", so spell those four as
        # escape sequences that parse back to the same character.  The
        # backslash must be handled first so later escapes are not doubled.
        code = (ch.replace('\\', '\\\\')
                  .replace("'", "\\'")
                  .replace('\n', '\\n')
                  .replace('\r', '\\r'))
        Utf8Describe._execute(self, w, code=code)

class Utf8Query(Method):
    '''look up a UTF-8 glyph by its unicode character name'''
    args = [arg("name", t=type(""), p="Glyph Name: ", h="the name of the UTF-8 Glyph")]
    def _execute(self, w, **vargs):
        # Resolve the typed character name and report the glyph together with
        # its escaped form; nothing is inserted into the buffer.
        name = vargs['name']
        try:
            # Only the lookup can raise KeyError, so only it sits in the try.
            u = unicodedata.lookup(name)
        except KeyError:
            w.set_error("glyph %r was not found" % name)
            return
        # unicode_escape gives the text repr() shows between the quotes,
        # without depending on the width of repr()'s string prefix.
        escaped = u.encode('unicode_escape').decode('ascii')
        w.set_error("glyph %s (%s)" % (u, escaped))

class Utf8Insert(Method):
    '''insert UTF-8 data into the buffer'''
    args = [arg("data", t=type(""), p="UTF-8 Data: ", h="the UTF-8 data to use")]
    def _execute(self, w, **vargs):
        s = "u'" + vargs['data'] + "'"
        u = unichar(vargs['code'])
        if u is None:
            w.set_error("invalid: %s" % vargs['data'])