You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
306 lines
11 KiB
306 lines
11 KiB
#include <iomanip>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

#include <windows.h>

// https://github.com/aantron/better-enums
#include "enum.h"
|
|
|
|
/// <summary>
|
|
/// 制御文字を含まず表示可能な文字の全可能性を入力してシフトJIS(コードページ932)からワイド文字の変換表を作成して返す
|
|
/// </summary>
|
|
/// <returns>シフトJIS1文字の文字列をキー、ワイド文字1文字の文字列を値としたstd::map</returns>
|
|
auto create_sjis2ws() {
|
|
std::map<std::string, std::wstring> sjis2ws;
|
|
char ch[3];
|
|
ch[1] = 0;
|
|
ch[2] = 0;
|
|
for (ch[0] = 0x20; ch[0] != 0; ++ch[0]) {
|
|
wchar_t wch[3];
|
|
int len = MultiByteToWideChar(932, MB_ERR_INVALID_CHARS, ch, 1, wch, sizeof(wch) / sizeof(wch[0]));
|
|
if (len == 1) {
|
|
sjis2ws[std::string(ch)] = std::wstring(1, wch[0]);
|
|
}
|
|
else {
|
|
for (ch[1] = 0x20; ch[1] != 0; ++ch[1]) {
|
|
int len = MultiByteToWideChar(932, MB_ERR_INVALID_CHARS, ch, 2, wch, sizeof(wch) / sizeof(wch[0]));
|
|
if (len == 1) {
|
|
sjis2ws[std::string(ch)] = std::wstring(1, wch[0]);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return sjis2ws;
|
|
}
|
|
|
|
/// <summary>
/// Inverts a Shift_JIS-to-wide-character table. Several Shift_JIS codes may
/// map to the same wide character, so each wide string is associated with the
/// list of all Shift_JIS strings that produce it, in the iteration order of
/// the input map.
/// </summary>
/// <typeparam name="K">Shift_JIS string type (key type of the input map)</typeparam>
/// <typeparam name="V">wide string type (mapped type of the input map)</typeparam>
/// <param name="sjis2ws">conversion table from Shift_JIS string to wide string</param>
/// <returns>std::map keyed by wide string whose value is the vector of Shift_JIS strings mapping to it</returns>
template<typename K, typename V>
auto create_ws2sjis(const std::map<K,V>& sjis2ws) {
    std::map<std::wstring, std::vector<std::string>> ws2sjis;
    for (const auto& sjis2ws_pair : sjis2ws) {
        // operator[] creates the empty vector the first time a wide string is
        // seen, so a single lookup handles both the new and the existing case.
        ws2sjis[sjis2ws_pair.second].push_back(sjis2ws_pair.first);
    }
    return ws2sjis;
}
|
|
|
|
/// <summary>
|
|
/// シーケンスの列挙定義
|
|
/// </summary>
|
|
BETTER_ENUM(EscapeType, uint8_t,
|
|
ASCII,
|
|
JIS_X_201_1976,
|
|
JIS_X_0208_1978,
|
|
JIS_X_0208_1990,
|
|
JIS_X_0212_1990,
|
|
GB_2312_80,
|
|
KS_X_1001_1992,
|
|
ISO_IEC_8859_1_HIGH,
|
|
ISO_IEC_8859_7_HIGH,
|
|
JIS_X_0213_2000_1,
|
|
JIS_X_0213_2000_2,
|
|
JIS_X_0213_2004_1,
|
|
JIS_X_0201_KANA
|
|
);
|
|
|
|
/// <summary>
|
|
/// ISO-2022-JP-2のエスケープシーケンス
|
|
/// </summary>
|
|
const struct sequence_type {
|
|
/// <summary>
|
|
/// エスケープシーケンス文字列
|
|
/// </summary>
|
|
std::string escapes;
|
|
/// <summary>
|
|
/// 文字セットなどの名前
|
|
/// </summary>
|
|
//std::string name;
|
|
EscapeType type;
|
|
/// <summary>
|
|
/// 1文字に必要な符号の長さ[byte]
|
|
/// </summary>
|
|
int char_length;
|
|
} seq_type[] = {
|
|
{"\x1b(B", EscapeType::ASCII, 1, },
|
|
{"\x1b(J", EscapeType::JIS_X_201_1976, 1},
|
|
{"\x1b$@", EscapeType::JIS_X_0208_1978, 2},
|
|
{"\x1b$B", EscapeType::JIS_X_0208_1990, 2},
|
|
{"\x1b$(D", EscapeType::JIS_X_0212_1990, 2},
|
|
{"\x1b$A", EscapeType::GB_2312_80, 2},
|
|
{"\x1b$(C", EscapeType::KS_X_1001_1992, 2},
|
|
{"\x1b.A", EscapeType::ISO_IEC_8859_1_HIGH, 1},
|
|
{"\x1b.F", EscapeType::ISO_IEC_8859_7_HIGH, 1},
|
|
{"\x1b$(O", EscapeType::JIS_X_0213_2000_1, 2},
|
|
{"\x1b$(P", EscapeType::JIS_X_0213_2000_2, 2},
|
|
{"\x1b$(Q", EscapeType::JIS_X_0213_2004_1, 2},
|
|
{"\x1b(I", EscapeType::JIS_X_0201_KANA, 1},
|
|
//{"\x1b$(?", "JIS X 0208-1990(gaiji)", 2},
|
|
};
|
|
|
|
/// <summary>
|
|
/// JIS文字列の先頭のエスケープシーケンスを読み取り、種別を判断して返す
|
|
/// </summary>
|
|
/// <typeparam name="T">文字列型</typeparam>
|
|
/// <param name="s">JIS文字列</param>
|
|
/// <returns>エスケープシーケンスタイプを表すポインタ</returns>
|
|
template<typename T>
|
|
auto find_and_cut_sequence(T& s) {
|
|
const sequence_type* pst = &seq_type[0];
|
|
for (const auto& st : seq_type) {
|
|
const auto& esc = st.escapes;
|
|
if (s.substr(0, esc.length()) == esc) {
|
|
pst = &st;
|
|
s = s.substr(esc.length(), st.char_length);
|
|
break;
|
|
}
|
|
}
|
|
if (pst == &seq_type[0] && s[0] == '\x1b') {
|
|
throw std::exception("unknown escape sequence");
|
|
}
|
|
return pst;
|
|
}
|
|
|
|
/// <summary>
|
|
/// ISO-2022-JP-2からワイド文字列への変換表を作成して返す
|
|
/// </summary>
|
|
/// <typeparam name="T">std::pairをiterate可能なコンテナ型</typeparam>
|
|
/// <param name="ws2sjis">std::pairのfirstにワイド文字列が入っているコンテナ</param>
|
|
/// <returns>ISO-2022-JP-2(文字セット名, 文字列のstd::pair)からワイド文字列への変換表</returns>
|
|
template<typename T>
|
|
auto create_jis2ws(const T& ws2sjis) {
|
|
std::map<std::pair<EscapeType, std::string>, std::wstring> jis2ws;
|
|
for (const auto& ws2sjis_pair : ws2sjis) {
|
|
const auto& ws = ws2sjis_pair.first;
|
|
char jis[100];
|
|
auto len = WideCharToMultiByte(50221, 0, ws.c_str(), static_cast<int>(ws.length()), jis, sizeof(jis) / sizeof(jis[0]), NULL, NULL);
|
|
if (len <= 0 || (jis[0] == '?' && ws[0] != L'?')) continue;
|
|
std::string jisstr(jis, len);
|
|
auto pst = find_and_cut_sequence(jisstr);
|
|
auto key = std::make_pair(pst->type, jisstr);
|
|
if (jis2ws.find(key) != jis2ws.end()) {
|
|
throw std::exception("duplicated jis code");
|
|
}
|
|
jis2ws[key] = ws;
|
|
}
|
|
return jis2ws;
|
|
}
|
|
|
|
/// <summary>
/// Computes the JIS X 0208 row number ("ku") from the two bytes of a
/// double-byte Shift_JIS code. Each first byte covers two consecutive rows;
/// a second byte below 0x9F selects the lower (odd) one.
/// </summary>
/// <param name="first">first byte of the Shift_JIS code</param>
/// <param name="second">second byte of the Shift_JIS code</param>
/// <returns>row number, taken modulo 256</returns>
std::uint8_t get_jis208_ku(std::uint8_t first, std::uint8_t second) {
    // First bytes above 0x9F belong to the second lead-byte range, which is
    // shifted by a further 0x80 once the value is doubled.
    const unsigned int range_offset = (first <= 0x9fU) ? 0x0U : 0x80U;
    const unsigned int lower_row = (second < 0x9fU) ? 1U : 0U;
    const unsigned int doubled = static_cast<unsigned int>(first) << 1;

    // The intermediate value exceeds 255 on purpose; only its low 8 bits are
    // the row number, so the narrowing is made explicit here.
    return static_cast<std::uint8_t>(doubled - range_offset - lower_row);
}
|
|
|
|
/// <summary>
|
|
/// シフトJIS1文字分の文字列を受け取り、その元の文字セット分類を返す
|
|
/// </summary>
|
|
/// <typeparam name="T">文字列型</typeparam>
|
|
/// <param name="s">シフトJIS1バイト分の文字列</param>
|
|
/// <returns>分類を表す文字列</returns>
|
|
template<typename T>
|
|
auto get_sjis_type(const T& s) {
|
|
if (s.length() == 0) throw std::exception("sjis empty!");
|
|
if (static_cast<std::uint8_t>(s[0]) < 0x80U || (0xa1U <= static_cast<std::uint8_t>(s[0]) && static_cast<std::uint8_t>(s[0]) <= 0xdfU)) {
|
|
return std::string("JIS X 0201:1997 (ラテン文字・片仮名)");
|
|
}
|
|
else {
|
|
if (s.length() <= 1) throw std::exception("sjis 2nd byte not found");
|
|
auto ku = get_jis208_ku(s[0], s[1]);
|
|
if (ku == 13) {
|
|
return std::string("NEC特殊文字");
|
|
}
|
|
else if (89 <= ku && ku <= 92) {
|
|
return std::string("NEC選定IBM拡張文字");
|
|
}
|
|
else if (115 <= ku && ku <= 119) {
|
|
return std::string("IBM拡張文字");
|
|
}
|
|
else if (95 <= ku && ku <= 114) {
|
|
return std::string("ユーザー定義外字");
|
|
}
|
|
else {
|
|
return std::string("JIS X 0208:1997");
|
|
}
|
|
}
|
|
}
|
|
|
|
/// <summary>
|
|
/// JISコード(タイプ&コード)から区を抽出し、サブタイプを判断して返す
|
|
/// </summary>
|
|
/// <typeparam name="F">シーケンスタイプ型</typeparam>
|
|
/// <typeparam name="S">JIS文字列型</typeparam>
|
|
/// <param name="jis">JIS文字列</param>
|
|
/// <returns>サブタイプを表す文字列</returns>
|
|
template<typename F, typename S>
|
|
auto get_jis_subtype(const std::pair<F,S>& jis) {
|
|
const auto ku = jis.second[0] - 0x20;
|
|
if (jis.first == +EscapeType::JIS_X_0208_1990) {
|
|
if (1 << ku && ku <= 8) {
|
|
return std::string("記号、英数字、かな");
|
|
}
|
|
else if (16 <= ku && ku <= 47) {
|
|
return std::string("第1水準漢字");
|
|
}
|
|
else if (48 <= ku && ku <= 84){
|
|
return std::string("第2水準漢字");
|
|
}
|
|
return std::string("未定義");
|
|
}
|
|
else if (jis.first == +EscapeType::ASCII) {
|
|
return std::string("");
|
|
}
|
|
else if (jis.first == +EscapeType::JIS_X_0201_KANA) {
|
|
return std::string("");
|
|
}
|
|
else if (jis.first == +EscapeType::JIS_X_0212_1990) {
|
|
if (2 <= ku and ku <= 11) {
|
|
return std::string("非漢字");
|
|
}
|
|
if (12 <= ku and ku <= 77) {
|
|
return std::string("漢字");
|
|
}
|
|
return std::string("未定義");
|
|
}
|
|
else {
|
|
throw std::exception("unkown type");
|
|
}
|
|
}
|
|
|
|
/// <summary>
/// Writes a JSON array to standard output, one object per entry of jis2ws in
/// the iteration order of that map. Each object holds the numeric value of
/// the first wide character ("unicode"), the JIS character set, packed code
/// and subtype ("jis"), and the list of Shift_JIS codes for the same wide
/// string with their classification ("ms932"). A line break follows every
/// object and the closing bracket.
/// </summary>
/// <typeparam name="JIS">JIS key type: pair of character set and code bytes</typeparam>
/// <typeparam name="WS">wide string type</typeparam>
/// <typeparam name="SJIS">container type of Shift_JIS strings</typeparam>
/// <param name="jis2ws">conversion table from (character set, code bytes) to wide string</param>
/// <param name="ws2sjis">std::map from wide string to the Shift_JIS strings that map to it; must contain every wide string found in jis2ws</param>
template<typename JIS, typename WS, typename SJIS>
void print_json(const std::map<JIS, WS>& jis2ws, const std::map<WS, SJIS>& ws2sjis) {
    // Folds the bytes of a code into one integer, first byte most significant.
    const auto pack_bytes = [](const auto& bytes) {
        unsigned int packed = 0;
        for (const auto byte : bytes) {
            packed = (packed << (sizeof(byte) * 8)) | static_cast<std::uint8_t>(byte);
        }
        return packed;
    };

    std::cout << "[";
    const char* record_separator = "";
    for (const auto& record : jis2ws) {
        const auto& jis = record.first;
        const auto& ws = record.second;

        // Empty before the first object, a comma before every later one.
        std::cout << record_separator;
        record_separator = ",";

        std::cout << "{\"unicode\":" << static_cast<int>(ws[0])
                  << ",\"jis\":{\"type\":\"" << jis.first
                  << "\",\"code\":";

        const unsigned int jis_code = pack_bytes(jis.second);
        std::cout << jis_code
                  << ",\"subtype\":\"" << get_jis_subtype(jis)
                  << "\"},\"ms932\":[";

        const char* sjis_separator = "";
        for (const auto& sjis : ws2sjis.at(ws)) {
            std::cout << sjis_separator << "{\"code\":";
            sjis_separator = ",";

            const unsigned int sjis_code = pack_bytes(sjis);
            std::cout << sjis_code;
            std::cout << ",\"type\":\"" << get_sjis_type(sjis) << "\"}";
        }
        std::cout << "]}" << std::endl;
    }
    std::cout << "]" << std::endl;
}
|
|
|
|
/// <summary>
|
|
/// メイン関数
|
|
/// </summary>
|
|
/// <param name="argc">コマンド+引数の数</param>
|
|
/// <param name="argv">コマンド,引数1,引数2,...の文字列</param>
|
|
/// <returns>プロセス戻り値</returns>
|
|
int main(int argc, char* argv[])
|
|
{
|
|
setlocale(LC_CTYPE, ".UTF-8");
|
|
auto sjis2ws = create_sjis2ws();
|
|
auto ws2sjis = create_ws2sjis(sjis2ws);
|
|
auto jis2ws = create_jis2ws(ws2sjis);
|
|
print_json(jis2ws, ws2sjis);
|
|
return 0;
|
|
}
|