diff options
Diffstat (limited to 'intl/uconv/tools/gen-big5-data.py')
-rw-r--r-- | intl/uconv/tools/gen-big5-data.py | 253 |
1 files changed, 253 insertions, 0 deletions
#!/usr/bin/python

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Adapted from
# https://hg.mozilla.org/projects/htmlparser/file/0d906fb1ab90/generate-encoding-data.py

# indexes.json comes from
# https://encoding.spec.whatwg.org/indexes.json
# i.e.
# https://github.com/whatwg/encoding/blob/ce4e83d0df5b5efec0697fc76e66699737e033a3/indexes.json

"""Generate ../ucvtw/nsBIG5Data.cpp from the "big5" index in indexes.json.

The generated C++ contains two compressed tables (the low 16 bits of every
code point, and one "is above U+FFFF" bit per pointer) together with the
nsBIG5Data::LowBits, nsBIG5Data::IsAstral and nsBIG5Data::FindPointer
functions that index into them.

Run it from intl/uconv/tools/ with indexes.json in the current directory.
"""

import json

# Pointer at which the reverse lookup in FindPointer starts, so that the
# HKSCS part of the index is not searched.
hkscsBound = (0xA1 - 0x81) * 157

# Code points for which FindPointer gets an explicit switch case returning
# the *last* pointer that maps to them.
preferLast = [
    0x2550,
    0x255E,
    0x2561,
    0x256A,
    0x5341,
    0x5345,
]

# ---------------------------------------------------------------------------
# C++ templates.  Pieces are written with explicit "\n" where leading or
# trailing whitespace matters, so that it stays visible in this file.
# ---------------------------------------------------------------------------

_FILE_HEADER = '''/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
 * Instead, please regenerate using intl/uconv/tools/gen-big5-data.py
 */

#include "nsBIG5Data.h"

static const char16_t kBig5LowBitsTable[] = {
'''

_ASTRALNESS_TABLE_OPEN = '''};

static const uint32_t kBig5AstralnessTable[] = {
'''

_LOW_BITS_OPEN = '''};

// static
char16_t
nsBIG5Data::LowBits(size_t aPointer)
{
'''

_LOW_BITS_RANGE = '''  if (aPointer < %d) {
    return 0;
  }
  if (aPointer < %d) {
    return kBig5LowBitsTable[%d + (aPointer - %d)];
  }
'''

_IS_ASTRAL_OPEN = '''  return 0;
}

// static
bool
nsBIG5Data::IsAstral(size_t aPointer)
{
'''

_IS_ASTRAL_SINGLE = '''  if (aPointer < %d) {
    return false;
  }
  if (aPointer == %d) {
    return true;
  }
'''

_IS_ASTRAL_RANGE = '''  if (aPointer < %d) {
    return false;
  }
  if (aPointer < %d) {
    size_t index = %d + (aPointer - %d);
    return kBig5AstralnessTable[index >> 5] & (1 << (index & 0x1F));
  }
'''

_FIND_POINTER_OPEN = '''  return false;
}

//static
size_t
nsBIG5Data::FindPointer(char16_t aLowBits, bool aIsAstral)
{
  if (!aIsAstral) {
    switch (aLowBits) {
'''

_PREFER_LAST_CASE = '''      case 0x%04X:
        return %d;
'''

# No trailing newline: the search loop template starts with one.
_SWITCH_CLOSE = (
    "      default:\n"
    "        break;\n"
    "    }\n"
    "  }"
)

# Ends with the indentation for the first "if (i < ...)" / "{" that follows.
_SEARCH_LOOP_OPEN = (
    "\n"
    "  for (size_t i = %d; i < MOZ_ARRAY_LENGTH(kBig5LowBitsTable); ++i) {\n"
    "    if (kBig5LowBitsTable[i] == aLowBits) {\n"
    "      size_t pointer;\n"
    "      "
)

# Ends with "else " so that the next branch (or the final block) chains on.
_POINTER_BRANCH = (
    "if (i < %d) {\n"
    "        pointer = i + %d;\n"
    "      } else "
)

_POINTER_LAST_BRANCH = (
    "{\n"
    "        pointer = i + %d;\n"
    "      }"
)

_FIND_POINTER_CLOSE = '''
      if (aIsAstral == IsAstral(pointer)) {
        return pointer;
      }
    }
  }
  return 0;
}
'''


def nullToZero(codePoint):
    """Return codePoint unchanged, or 0 when it is falsy (e.g. JSON null)."""
    return codePoint if codePoint else 0


def findGaps(index, isFiller, minLength):
    """Find long runs of "filler" entries in index.

    isFiller is called on each entry; a maximal run of consecutive entries
    for which it returns true is reported as a half-open (start, end) pair
    when the run is longer than minLength.

    A run is only reported once a non-filler entry terminates it, so a run
    that extends to the very end of index is never reported.  That matches
    the behavior the generated tables have always been built with.
    """
    gaps = []
    runStart = 0
    runLength = 0
    for offset, codePoint in enumerate(index):
        if isFiller(codePoint):
            if runLength == 0:
                runStart = offset
            runLength += 1
        else:
            if runLength > minLength:
                gaps.append((runStart, runStart + runLength))
            runLength = 0
    return gaps


def invertRanges(ranges, cap):
    """Return the half-open ranges of [0, cap) not covered by ranges.

    ranges must be sorted, non-overlapping (start, end) pairs.  A leading
    piece is omitted when a range starts at 0; the final piece up to cap is
    always appended, even when it is empty.
    """
    inverted = []
    invertStart = 0
    for (start, end) in ranges:
        if start != 0:
            inverted.append((invertStart, start))
        invertStart = end
    inverted.append((invertStart, cap))
    return inverted


def packBits(bits):
    """Pack a list of 0/1 values into 32-bit words, least significant first.

    An array of bool is inefficient per
    http://stackoverflow.com/questions/4049156/1-bit-per-bool-in-array-c

    The list is zero-padded to a multiple of 32.  When the length is already
    a multiple of 32 a whole extra zero word is appended; this is kept so
    that the generated table stays identical to previous runs.
    """
    padded = list(bits)
    padded.extend([0] * (32 - (len(padded) % 32)))
    words = []
    for wordStart in range(0, len(padded), 32):
        accu = 0
        for bit in range(32):
            accu |= padded[wordStart + bit] << bit
        words.append(accu)
    return words


def _renderLowBitsTable(index, ranges):
    """Return the kBig5LowBitsTable entries for the pointers in ranges."""
    return "".join([
        '  0x%04X,\n' % (index[i] & 0xFFFF)
        for (low, high) in ranges
        for i in range(low, high)
    ])


def _renderAstralnessTable(index, astralRanges):
    """Return the kBig5AstralnessTable entries for astralRanges."""
    bits = [
        1 if index[i] > 0xFFFF else 0
        for (low, high) in astralRanges
        for i in range(low, high)
    ]
    return "".join(['  0x%08X,\n' % word for word in packBits(bits)])


def _renderLowBitsBody(ranges):
    """Return the range checks that make up the body of LowBits()."""
    parts = []
    base = 0  # position in kBig5LowBitsTable where the current range starts
    for (low, high) in ranges:
        parts.append(_LOW_BITS_RANGE % (low, high, base, low))
        base += (high - low)
    return "".join(parts)


def _renderIsAstralBody(astralRanges):
    """Return the range checks that make up the body of IsAstral()."""
    parts = []
    base = 0  # bit position where the current range starts
    for (low, high) in astralRanges:
        if high - low == 1:
            # A one-pointer range needs no table lookup.
            parts.append(_IS_ASTRAL_SINGLE % (low, low))
        else:
            parts.append(_IS_ASTRAL_RANGE % (low, high, base, low))
        # Every range contributes bits to the table, including the
        # single-pointer ones, so base advances in both cases.
        base += (high - low)
    return "".join(parts)


def _renderFindPointerBody(index, ranges):
    """Return everything in FindPointer() after the opening of its switch."""
    parts = []

    for codePoint in preferLast:
        # Python lists don't have .rindex() :-(
        for i in range(len(index) - 1, -1, -1):
            if index[i] == codePoint:
                parts.append(_PREFER_LAST_CASE % (codePoint, i))
                break
    parts.append(_SWITCH_CLOSE)

    # Table position of hkscsBound: the linear search starts there so that
    # the HKSCS part of the table is not searched.
    base = 0
    start = 0
    for (low, high) in ranges:
        if low <= hkscsBound < high:
            start = base + hkscsBound - low
            break
        base += (high - low)
    parts.append(_SEARCH_LOOP_OPEN % start)

    # Map a table position back to a pointer.  A branch is emitted for each
    # range once a range reaching past hkscsBound has been seen; the branch
    # for a range is written one iteration late so that the last range can
    # become the unconditional final block.
    base = 0
    prevLow = prevHigh = prevBase = 0
    writing = False
    for (low, high) in ranges:
        if writing:
            parts.append(_POINTER_BRANCH % (prevBase + prevHigh - prevLow,
                                            prevLow - prevBase))
        prevLow, prevHigh, prevBase = low, high, base
        if high > hkscsBound:
            writing = True
        base += (high - low)
    parts.append(_POINTER_LAST_BRANCH % (prevLow - prevBase))

    parts.append(_FIND_POINTER_CLOSE)
    return "".join(parts)


def renderBig5Data(index):
    """Return the complete text of nsBIG5Data.cpp for index.

    index is the Big5 index as a list of code points in which unmapped
    pointers are 0 (see nullToZero).
    """
    cap = len(index)

    # Runs of more than 4 consecutive unmapped pointers are left out of the
    # low bits table.
    ranges = invertRanges(
        findGaps(index, lambda codePoint: codePoint == 0, 4), cap)

    # Runs of more than 40 consecutive non-astral pointers are left out of
    # the astralness table.
    astralRanges = invertRanges(
        findGaps(index, lambda codePoint: codePoint <= 0xFFFF, 40), cap)

    return "".join([
        _FILE_HEADER,
        _renderLowBitsTable(index, ranges),
        _ASTRALNESS_TABLE_OPEN,
        _renderAstralnessTable(index, astralRanges),
        _LOW_BITS_OPEN,
        _renderLowBitsBody(ranges),
        _IS_ASTRAL_OPEN,
        _renderIsAstralBody(astralRanges),
        _FIND_POINTER_OPEN,
        _renderFindPointerBody(index, ranges),
    ])


def main(indexPath="indexes.json", outputPath="../ucvtw/nsBIG5Data.cpp"):
    """Read the Big5 index from indexPath and write the C++ to outputPath."""
    with open(indexPath, "r") as indexFile:
        indexes = json.load(indexFile)
    index = [nullToZero(codePoint) for codePoint in indexes["big5"]]

    # Render everything before opening the output so that a failure while
    # generating cannot leave a truncated file behind.
    text = renderBig5Data(index)
    with open(outputPath, "w") as classFile:
        classFile.write(text)


if __name__ == "__main__":
    main()