pmacs3/method/utf8.py

205 lines
6.2 KiB
Python

import re
import unicodedata
from method import Method, Argument, arg
# Human-readable descriptions of the two-letter Unicode "General Category"
# codes, keyed by the code that unicodedata.category() returns.
category_map = {
    # Letters
    'Lu': 'Letter, Uppercase',
    'Ll': 'Letter, Lowercase',
    'Lt': 'Letter, Titlecase',
    'Lm': 'Letter, Modifier',
    'Lo': 'Letter, Other',

    # Marks
    'Mn': 'Mark, Nonspacing',
    'Mc': 'Mark, Spacing Combining',
    'Me': 'Mark, Enclosing',

    # Numbers
    'Nd': 'Number, Decimal Digit',
    'Nl': 'Number, Letter',
    'No': 'Number, Other',

    # Punctuation
    'Pc': 'Punctuation, Connector',
    'Pd': 'Punctuation, Dash',
    'Ps': 'Punctuation, Open',
    'Pe': 'Punctuation, Close',
    'Pi': 'Punctuation, Initial quote (may behave like Ps or Pe depending on usage)',
    'Pf': 'Punctuation, Final quote (may behave like Ps or Pe depending on usage)',
    'Po': 'Punctuation, Other',

    # Symbols
    'Sm': 'Symbol, Math',
    'Sc': 'Symbol, Currency',
    'Sk': 'Symbol, Modifier',
    'So': 'Symbol, Other',

    # Separators
    'Zs': 'Separator, Space',
    'Zl': 'Separator, Line',
    'Zp': 'Separator, Paragraph',

    # Everything else
    'Cc': 'Other, Control',
    'Cf': 'Other, Format',
    'Cs': 'Other, Surrogate',
    'Co': 'Other, Private Use',
    'Cn': 'Other, Not Assigned (no characters in the file have this property)',
}
# Human-readable descriptions of the Unicode bidirectional class codes, keyed
# by the code that unicodedata.bidirectional() returns.
bidirect_map = {
    # Strong types
    'L': 'Left-to-Right',
    'LRE': 'Left-to-Right Embedding',
    'LRO': 'Left-to-Right Override',
    'R': 'Right-to-Left',
    'AL': 'Right-to-Left Arabic',
    'RLE': 'Right-to-Left Embedding',
    'RLO': 'Right-to-Left Override',
    'PDF': 'Pop Directional Format',

    # Weak types
    'EN': 'European Number',
    'ES': 'European Number Separator',
    'ET': 'European Number Terminator',
    'AN': 'Arabic Number',
    'CS': 'Common Number Separator',
    'NSM': 'Nonspacing Mark',
    'BN': 'Boundary Neutral',

    # Neutral types
    'B': 'Paragraph Separator',
    'S': 'Segment Separator',
    'WS': 'Whitespace',
    'ON': 'Other Neutrals',

    # Directional isolates (U+2066..U+2069).  Newer Unicode databases report
    # these classes; without entries here such characters would be described
    # as having no directional info.
    'LRI': 'Left-to-Right Isolate',
    'RLI': 'Right-to-Left Isolate',
    'FSI': 'First Strong Isolate',
    'PDI': 'Pop Directional Isolate',
}
# Human-readable descriptions of the Unicode canonical combining classes,
# keyed by the integer that unicodedata.combining() returns.
combine_map = {
    0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
    1: 'Overlays and interior',
    7: 'Nuktas',
    8: 'Hiragana/Katakana voicing marks',
    9: 'Viramas',
    10: 'Start of fixed position classes',
    199: 'End of fixed position classes',

    # Attached marks
    200: 'Below left attached',
    202: 'Below attached',
    204: 'Below right attached',
    208: 'Left attached (reordrant around single base character)',
    210: 'Right attached',
    212: 'Above left attached',
    214: 'Above attached',
    216: 'Above right attached',

    # Unattached marks
    218: 'Below left',
    220: 'Below',
    222: 'Below right',
    224: 'Left (reordrant around single base character)',
    226: 'Right',
    # 228/230/232 run left, centre, right like the other triples above; the
    # labels for 228 and 232 used to be swapped.
    228: 'Above left',
    230: 'Above',
    232: 'Above right',

    233: 'Double below',
    234: 'Double above',
    240: 'Below (iota subscript)',
}
# Human-readable descriptions of the East Asian Width codes, keyed by the
# code that unicodedata.east_asian_width() returns.
width_map = {
    'W': 'East Asian Wide',
    'F': 'East Asian Full-width',
    'A': 'East Asian Ambiguous',
    'H': 'East Asian Half-width',
    'Na': 'East Asian Narrow',
    # 'N' is the Neutral class (characters that do not occur in East Asian
    # typography), not a second kind of "narrow" -- that is 'Na' above.
    'N': 'Neutral (Not East Asian)',
}
def hex2(i):
    '''return the hex digits of i, left-padded with one "0" to an even length

    The digits are whatever hex() produces with its first two characters
    (the "0x" prefix) removed.
    '''
    digits = hex(i)[2:]
    needs_padding = len(digits) % 2 == 1
    return '0' + digits if needs_padding else digits
def uniesc(i):
    '''return a backslash-x escape built from the even-length hex form of i'''
    return '\\x%s' % hex2(i)
def unichar(s):
    '''decode s, the body of a unicode string literal, into a unicode string

    s may contain literal characters and/or escape sequences such as
    \\u00e9 or \\N{BULLET}.  Returns the decoded string, or None when s is
    not a valid literal body.
    '''
    import ast

    literal = "u'" + s + "'"
    try:
        # literal_eval only accepts literals, so text containing a quote
        # cannot break out of the string and run arbitrary expressions the
        # way a plain eval() would allow.
        value = ast.literal_eval(literal)
    except Exception:
        # Any parse failure means "not a valid literal"; unlike a bare
        # except this lets KeyboardInterrupt and SystemExit propagate.
        return None
    # A stray quote can still yield some other literal (e.g. a tuple);
    # callers only ever expect text.
    if not isinstance(value, type(u'')):
        return None
    return value
def unicodeget(u, fname, fallback):
    '''call unicodedata.<fname>(u), returning fallback when there is no answer

    u        -- the unicode character to look up
    fname    -- name of a function in the unicodedata module
    fallback -- value returned when the function does not exist, raises for
                this input, or reports nothing (None or an empty string)

    A numeric result of 0 (combining class 0, digit 0, "not mirrored") is a
    real answer and is returned as-is rather than being treated as missing.
    '''
    try:
        value = getattr(unicodedata, fname)(u)
    except (AttributeError, KeyError, TypeError, ValueError):
        # AttributeError: no such function; the others are what the
        # unicodedata functions raise for unknown or unsuitable input.
        return fallback
    if value is None or value == '':
        return fallback
    return value
class Utf8Describe(Method):
    '''get detailed utf-8 data about a particular utf-8 code point'''
    args = [arg("code", t=type(""), p="Code Point: ", h="UTF-8 code point to use")]

    # Matches a backslash-u escape followed by one or more pairs of hex
    # digits.  Written as a raw string so the regex engine receives an
    # escaped backslash; a non-raw '\\u' reaches it as a bare \u escape,
    # which does not match a backslash at all.
    cpt_re = re.compile(r'^\\u(?:[0-9a-fA-F]{2})+$')

    # Report template; filled in _execute() in exactly this field order.
    format = '''
Glyph %s
Name %s
Code %s
Category %s
Bidirectional %s
Combining %s
Width %s
Mirroring %s
Decomposition %s
Decimal %s
Digit %s
Lookup %s
Normalize %s
Numeric %s'''

    def _execute(self, w, **vargs):
        '''describe the code point given in vargs['code'] in *Utf8-Info*

        Reports an error on w instead when the text cannot be decoded.
        '''
        u = unichar(vargs['code'])
        # unichar() signals failure with None; anything that is not text
        # cannot be described either.
        if u is None or not isinstance(u, type(u'')):
            w.set_error("invalid: %s" % vargs['code'])
            return

        # Raw property codes; each is shown next to its description below.
        a = unicodeget(u, 'category', '??')
        b = unicodeget(u, 'bidirectional', '?')
        c = unicodeget(u, 'combining', '?')
        d = unicodeget(u, 'east_asian_width', '?')

        # Escaped spelling of the character.  unicode_escape gives the same
        # text as the inside of a Python 2 unicode repr without depending on
        # the length of the repr's prefix.
        code = u.encode('unicode_escape').decode('ascii')
        name = unicodeget(u, 'name', 'Unnamed')
        category = category_map.get(a, 'No Category') + ' (%s)' % a
        bidirect = bidirect_map.get(b, 'No Directional Info') + ' (%s)' % b
        combine = combine_map.get(c, 'No Combining Info') + ' (%s)' % c
        mirror = unicodeget(u, 'mirrored', 'Unknown Mirroring')
        width = width_map.get(d, 'Unknown Width') + ' (%s)' % d
        decomposition = unicodeget(u, 'decomposition', 'No Decomposition Info')
        decimal = unicodeget(u, 'decimal', 'n/a')
        digit = unicodeget(u, 'digit', 'n/a')
        # NOTE(review): unicodedata.lookup() takes a character *name* and
        # unicodedata.normalize() takes (form, string), so called with just
        # the character these two presumably always yield 'n/a' -- confirm
        # what was meant to be displayed here.
        lookup = unicodeget(u, 'lookup', 'n/a')
        normalize = unicodeget(u, 'normalize', 'n/a')
        numeric = unicodeget(u, 'numeric', 'n/a')

        data = self.format % (u, name, code, category, bidirect, combine, width,
                              mirror, decomposition, decimal, digit, lookup,
                              normalize, numeric)
        w.application.data_buffer('*Utf8-Info*', data.strip(), switch_to=True)
class Utf8DescribeChar(Utf8Describe):
    '''get utf-8 representation of the highlighted character'''
    args = []

    def _execute(self, w, **vargs):
        '''describe the text between the logical cursor and p.add(1, 0)'''
        p = w.logical_cursor()
        # presumably p.add(1, 0) is the position one character further on,
        # so this is the single character under the cursor -- TODO confirm
        ch = w.buffer.get_substring(p, p.add(1, 0))
        # Utf8Describe splices its argument into a string literal, so a raw
        # quote, backslash or newline would be rejected as invalid.  Passing
        # the character as a \UXXXXXXXX escape makes every single character
        # describable; anything else is passed through untouched.
        if len(ch) == 1:
            ch = '\\U%08x' % ord(ch)
        Utf8Describe._execute(self, w, code=ch)
class Utf8Query(Method):
    '''look up a glyph by its unicode name and show it in the status line'''
    args = [arg("name", t=type(""), p="Glyph Name: ", h="the name of the UTF-8 Glyph")]

    def _execute(self, w, **vargs):
        '''report the glyph named vargs['name'], or that no such name exists'''
        name = vargs['name']
        try:
            u = unicodedata.lookup(name)
        except KeyError:
            # unicodedata.lookup raises KeyError for an unknown name.
            w.set_error("glyph %r was not found" % name)
            return
        # Show the glyph and its escaped spelling.  unicode_escape gives the
        # same text as the inside of a Python 2 unicode repr without
        # depending on the length of the repr's prefix.
        escaped = u.encode('unicode_escape').decode('ascii')
        w.set_error("glyph %s (%s)" % (u, escaped))
class Utf8Insert(Method):
'''insert UTF-8 data into the buffer'''
args = [arg("data", t=type(""), p="UTF-8 Data: ", h="the UTF-8 data to use")]
def _execute(self, w, **vargs):
s = "u'" + vargs['data'] + "'"
u = unichar(vargs['code'])
if u is None:
w.set_error("invalid: %s" % vargs['data'])