--- Text-encoding converters implemented as LPeg patterns.
--
-- Every exported value is an LPeg pattern.  Each pattern is anchored at the
-- end of the subject (`* -1`), so `pattern:match(bytes)` yields the converted
-- string only when the whole input could be consumed, and nil otherwise.
--
-- Requires Lua 5.3 or later (`utf8` library, bitwise operators).

-- Use an already defined global `lpeg` if there is one, else load the module.
local lpeg = lpeg or require'lpeg'
local P, R, C, Cc, Cg, Cs = lpeg.P, lpeg.R, lpeg.C, lpeg.Cc, lpeg.Cg, lpeg.Cs

---------------------------------------------------------------------------
-- UTF-16BE <-> UTF-8
---------------------------------------------------------------------------

--- Decode one two-byte big-endian code unit outside the surrogate range.
-- @tparam string s the two matched bytes
-- @treturn string the UTF-8 encoding of the code point
local function decode_bmp_unit(s)
  local high, low = string.byte(s, 1, 2)
  return utf8.char(high << 8 | low)
end

--- Decode a high/low surrogate pair (four bytes, big endian).
-- The high surrogate carries the upper ten payload bits: two of them in the
-- low bits of its first byte (`hh & 3`) and eight in its second byte (`hl`).
-- They therefore land at bits 18-19 and 10-17 of the result; the low
-- surrogate supplies bits 0-9 in the same way.
-- @tparam string s the four matched bytes
-- @treturn string the UTF-8 encoding of the supplementary code point
local function decode_surrogate_pair(s)
  local hh, hl, lh, ll = string.byte(s, 1, 4)
  return utf8.char(((hh & 3) << 18 | hl << 10 | (lh & 3) << 8 | ll) + 0x10000)
end

-- A code unit whose first byte is outside 0xD8-0xDF.
local bmp_unit = (R('\x00\xD7', '\xE0\xFF') * 1) / decode_bmp_unit
-- A high surrogate (0xD8-0xDB lead byte) followed by a low surrogate
-- (0xDC-0xDF lead byte).
local surrogate_pair = (R'\xD8\xDB' * 1 * R'\xDC\xDF' * 1) / decode_surrogate_pair
-- Any other two bytes (i.e. an unpaired surrogate) become U+FFFD.
local stray_unit = Cg(P(2) * Cc('\u{FFFD}'))

--- UTF-16BE (without BOM) to UTF-8.
-- Fails to match when the input ends in the middle of a code unit.
local utf16be_to_utf8 = Cs((bmp_unit + surrogate_pair + stray_unit)^0) * -1

--- Encode a single UTF-8 encoded character as UTF-16BE.
-- @tparam string utf8char a string starting with one UTF-8 encoded character
-- @treturn string two bytes, or four bytes (a surrogate pair) for code
--   points at or above U+10000
local function utf8cp_to_utf16be(utf8char)
  local codepoint = utf8.codepoint(utf8char)
  if codepoint < 0x10000 then
    return string.char(codepoint >> 8, codepoint & 0xFF)
  end
  codepoint = codepoint - 0x10000
  local high = (codepoint >> 10) | 0xD800
  local low = (codepoint & 0x3FF) | 0xDC00
  return string.char(high >> 8, high & 0xFF, low >> 8, low & 0xFF)
end

---------------------------------------------------------------------------
-- PDFDocEncoding
---------------------------------------------------------------------------

-- Bytes of PDFDocEncoding that do not coincide with their Latin-1 code
-- point.  The entries for 0x7F, 0x9F and 0xAD map to U+FFFD.
local pdfdoc_mapping = {
  ['\x18'] = '\u{02D8}', ['\x19'] = '\u{02C7}', ['\x1A'] = '\u{02C6}', ['\x1B'] = '\u{02D9}',
  ['\x1C'] = '\u{02DD}', ['\x1D'] = '\u{02DB}', ['\x1E'] = '\u{02DA}', ['\x1F'] = '\u{02DC}',
  ['\x7F'] = '\u{FFFD}',
  ['\x80'] = '\u{2022}', ['\x81'] = '\u{2020}', ['\x82'] = '\u{2021}', ['\x83'] = '\u{2026}',
  ['\x84'] = '\u{2014}', ['\x85'] = '\u{2013}', ['\x86'] = '\u{0192}', ['\x87'] = '\u{2044}',
  ['\x88'] = '\u{2039}', ['\x89'] = '\u{203A}', ['\x8A'] = '\u{2212}', ['\x8B'] = '\u{2030}',
  ['\x8C'] = '\u{201E}', ['\x8D'] = '\u{201C}', ['\x8E'] = '\u{201D}', ['\x8F'] = '\u{2018}',
  ['\x90'] = '\u{2019}', ['\x91'] = '\u{201A}', ['\x92'] = '\u{2122}', ['\x93'] = '\u{FB01}',
  ['\x94'] = '\u{FB02}', ['\x95'] = '\u{0141}', ['\x96'] = '\u{0152}', ['\x97'] = '\u{0160}',
  ['\x98'] = '\u{0178}', ['\x99'] = '\u{017D}', ['\x9A'] = '\u{0131}', ['\x9B'] = '\u{0142}',
  ['\x9C'] = '\u{0153}', ['\x9D'] = '\u{0161}', ['\x9E'] = '\u{017E}', ['\x9F'] = '\u{FFFD}',
  ['\xA0'] = '\u{20AC}',
  ['\xAD'] = '\u{FFFD}',
}

--- Re-encode a single byte 0xA1-0xFF as the UTF-8 form of the code point
-- with the same numeric value.
local function latin1_byte_to_utf8(c)
  return utf8.char(string.byte(c))
end

--- PDFDocEncoding to UTF-8.
-- Bytes 0x00-0x17 and 0x20-0x7E are copied unchanged, the bytes listed in
-- `pdfdoc_mapping` are looked up there, and the remaining high bytes are
-- converted by numeric value.
local pdfdoc_to_utf8 = Cs((
    R('\x00\x17', '\x20\x7E')
  + R('\xA1\xAC', '\xAE\xFF') / latin1_byte_to_utf8
  + R('\x18\x1F', '\x7F\xA0', '\xAD\xAD') / pdfdoc_mapping
)^0) * -1

--- PDF text string to UTF-8.
-- Alternatives are tried in order:
--   1. bytes FE FF followed by UTF-16BE data,
--   2. the UTF-8 encoding of U+FEFF followed by arbitrary bytes, which are
--      returned as they are (no validation is performed),
--   3. PDFDocEncoding.
-- The leading byte order mark is not part of the result.
local text_string_to_utf8 =
    '\xFE\xFF' * utf16be_to_utf8
  + '\u{FEFF}' * C(P(1)^0) * -1
  + pdfdoc_to_utf8

---------------------------------------------------------------------------
-- WinAnsiEncoding
---------------------------------------------------------------------------

-- Bytes 0x80-0x9F are listed explicitly; 0x81, 0x8D, 0x8F, 0x90 and 0x9D map
-- to the control character with the same number.
local winansi_mapping = {
  ['\x80'] = '\u{20AC}', ['\x81'] = '\u{0081}', ['\x82'] = '\u{201A}', ['\x83'] = '\u{0192}',
  ['\x84'] = '\u{201E}', ['\x85'] = '\u{2026}', ['\x86'] = '\u{2020}', ['\x87'] = '\u{2021}',
  ['\x88'] = '\u{02C6}', ['\x89'] = '\u{2030}', ['\x8A'] = '\u{0160}', ['\x8B'] = '\u{2039}',
  ['\x8C'] = '\u{0152}', ['\x8D'] = '\u{008D}', ['\x8E'] = '\u{017D}', ['\x8F'] = '\u{008F}',
  ['\x90'] = '\u{0090}', ['\x91'] = '\u{2018}', ['\x92'] = '\u{2019}', ['\x93'] = '\u{201C}',
  ['\x94'] = '\u{201D}', ['\x95'] = '\u{2022}', ['\x96'] = '\u{2013}', ['\x97'] = '\u{2014}',
  ['\x98'] = '\u{02DC}', ['\x99'] = '\u{2122}', ['\x9A'] = '\u{0161}', ['\x9B'] = '\u{203A}',
  ['\x9C'] = '\u{0153}', ['\x9D'] = '\u{009D}', ['\x9E'] = '\u{017E}', ['\x9F'] = '\u{0178}',
}
-- Bytes 0xA0-0xFF map to the code point with the same numeric value.
for byte = 0xA0, 0xFF do
  winansi_mapping[string.char(byte)] = utf8.char(byte)
end

--- WinAnsiEncoding to UTF-8.  Bytes below 0x80 are copied unchanged.
local winansi_to_utf8 = Cs((
    R('\x00\x7F')
  + R('\x80\xFF') / winansi_mapping
)^0) * -1

-- The same table with UTF-16BE encoded values.
local winansi_mapping_utf16be = {}
for byte, char in pairs(winansi_mapping) do
  winansi_mapping_utf16be[byte] = utf8cp_to_utf16be(char)
end

--- WinAnsiEncoding to UTF-16BE (no BOM is emitted).
-- For bytes below 0x80 the constant capture inserts a zero byte in front of
-- the byte, which itself is kept as matched.
local winansi_to_utf16be = Cs((
    R'\x80\xFF' / winansi_mapping_utf16be
  + Cc'\x00' * 1
)^0) * -1

---------------------------------------------------------------------------
-- MacRomanEncoding
---------------------------------------------------------------------------

local macroman_mapping = {
  ['\x80'] = '\u{00C4}', ['\x81'] = '\u{00C5}', ['\x82'] = '\u{00C7}', ['\x83'] = '\u{00C9}',
  ['\x84'] = '\u{00D1}', ['\x85'] = '\u{00D6}', ['\x86'] = '\u{00DC}', ['\x87'] = '\u{00E1}',
  ['\x88'] = '\u{00E0}', ['\x89'] = '\u{00E2}', ['\x8A'] = '\u{00E4}', ['\x8B'] = '\u{00E3}',
  ['\x8C'] = '\u{00E5}', ['\x8D'] = '\u{00E7}', ['\x8E'] = '\u{00E9}', ['\x8F'] = '\u{00E8}',
  ['\x90'] = '\u{00EA}', ['\x91'] = '\u{00EB}', ['\x92'] = '\u{00ED}', ['\x93'] = '\u{00EC}',
  ['\x94'] = '\u{00EE}', ['\x95'] = '\u{00EF}', ['\x96'] = '\u{00F1}', ['\x97'] = '\u{00F3}',
  ['\x98'] = '\u{00F2}', ['\x99'] = '\u{00F4}', ['\x9A'] = '\u{00F6}', ['\x9B'] = '\u{00F5}',
  ['\x9C'] = '\u{00FA}', ['\x9D'] = '\u{00F9}', ['\x9E'] = '\u{00FB}', ['\x9F'] = '\u{00FC}',
  ['\xA0'] = '\u{2020}', ['\xA1'] = '\u{00B0}', ['\xA2'] = '\u{00A2}', ['\xA3'] = '\u{00A3}',
  ['\xA4'] = '\u{00A7}', ['\xA5'] = '\u{2022}', ['\xA6'] = '\u{00B6}', ['\xA7'] = '\u{00DF}',
  ['\xA8'] = '\u{00AE}', ['\xA9'] = '\u{00A9}', ['\xAA'] = '\u{2122}', ['\xAB'] = '\u{00B4}',
  ['\xAC'] = '\u{00A8}', ['\xAD'] = '\u{2260}', ['\xAE'] = '\u{00C6}', ['\xAF'] = '\u{00D8}',
  ['\xB0'] = '\u{221E}', ['\xB1'] = '\u{00B1}', ['\xB2'] = '\u{2264}', ['\xB3'] = '\u{2265}',
  ['\xB4'] = '\u{00A5}', ['\xB5'] = '\u{00B5}', ['\xB6'] = '\u{2202}', ['\xB7'] = '\u{2211}',
  ['\xB8'] = '\u{220F}', ['\xB9'] = '\u{03C0}', ['\xBA'] = '\u{222B}', ['\xBB'] = '\u{00AA}',
  ['\xBC'] = '\u{00BA}', ['\xBD'] = '\u{03A9}', ['\xBE'] = '\u{00E6}', ['\xBF'] = '\u{00F8}',
  ['\xC0'] = '\u{00BF}', ['\xC1'] = '\u{00A1}', ['\xC2'] = '\u{00AC}', ['\xC3'] = '\u{221A}',
  ['\xC4'] = '\u{0192}', ['\xC5'] = '\u{2248}', ['\xC6'] = '\u{2206}', ['\xC7'] = '\u{00AB}',
  ['\xC8'] = '\u{00BB}', ['\xC9'] = '\u{2026}', ['\xCA'] = '\u{00A0}', ['\xCB'] = '\u{00C0}',
  ['\xCC'] = '\u{00C3}', ['\xCD'] = '\u{00D5}', ['\xCE'] = '\u{0152}', ['\xCF'] = '\u{0153}',
  ['\xD0'] = '\u{2013}', ['\xD1'] = '\u{2014}', ['\xD2'] = '\u{201C}', ['\xD3'] = '\u{201D}',
  ['\xD4'] = '\u{2018}', ['\xD5'] = '\u{2019}', ['\xD6'] = '\u{00F7}', ['\xD7'] = '\u{25CA}',
  ['\xD8'] = '\u{00FF}', ['\xD9'] = '\u{0178}', ['\xDA'] = '\u{2044}', ['\xDB'] = '\u{20AC}',
  ['\xDC'] = '\u{2039}', ['\xDD'] = '\u{203A}', ['\xDE'] = '\u{FB01}', ['\xDF'] = '\u{FB02}',
  ['\xE0'] = '\u{2021}', ['\xE1'] = '\u{00B7}', ['\xE2'] = '\u{201A}', ['\xE3'] = '\u{201E}',
  ['\xE4'] = '\u{2030}', ['\xE5'] = '\u{00C2}', ['\xE6'] = '\u{00CA}', ['\xE7'] = '\u{00C1}',
  ['\xE8'] = '\u{00CB}', ['\xE9'] = '\u{00C8}', ['\xEA'] = '\u{00CD}', ['\xEB'] = '\u{00CE}',
  ['\xEC'] = '\u{00CF}', ['\xED'] = '\u{00CC}', ['\xEE'] = '\u{00D3}', ['\xEF'] = '\u{00D4}',
  -- Apple logo: U+1F34F is used here; the alternative would be '\u{F8FF}'.
  ['\xF0'] = '\u{1F34F}',
  ['\xF1'] = '\u{00D2}', ['\xF2'] = '\u{00DA}', ['\xF3'] = '\u{00DB}', ['\xF4'] = '\u{00D9}',
  ['\xF5'] = '\u{0131}', ['\xF6'] = '\u{02C6}', ['\xF7'] = '\u{02DC}', ['\xF8'] = '\u{00AF}',
  ['\xF9'] = '\u{02D8}', ['\xFA'] = '\u{02D9}', ['\xFB'] = '\u{02DA}', ['\xFC'] = '\u{00B8}',
  ['\xFD'] = '\u{02DD}', ['\xFE'] = '\u{02DB}', ['\xFF'] = '\u{02C7}',
}

--- MacRomanEncoding to UTF-8.  Bytes below 0x80 are copied unchanged.
local macroman_to_utf8 = Cs((
    R('\x00\x7F')
  + R('\x80\xFF') / macroman_mapping
)^0) * -1

-- The same table with UTF-16BE encoded values.  The entry for 0xF0 lies
-- above U+FFFF and therefore becomes a surrogate pair.
local macroman_mapping_utf16be = {}
for byte, char in pairs(macroman_mapping) do
  macroman_mapping_utf16be[byte] = utf8cp_to_utf16be(char)
end

--- MacRomanEncoding to UTF-16BE (no BOM is emitted).
-- For bytes below 0x80 the constant capture inserts a zero byte in front of
-- the byte, which itself is kept as matched.
local macroman_to_utf16be = Cs((
    R'\x80\xFF' / macroman_mapping_utf16be
  + Cc'\x00' * 1
)^0) * -1

return {
  utf16be_to_utf8 = utf16be_to_utf8,
  text_string_to_utf8 = text_string_to_utf8,
  winansi_to_utf8 = winansi_to_utf8,
  winansi_to_utf16be = winansi_to_utf16be,
  macroman_to_utf8 = macroman_to_utf8,
  macroman_to_utf16be = macroman_to_utf16be,
}