Python で半角・全角の変換やカタカナとひらがなの変換などを行う簡単なモジュールを作ったので貼っておきます。他にも似たライブラリがありますがその一つとして。もしお役に立ちそうに思われた方はご自由にお使いください。で、何か不具合があれば教えてください♪
特徴
- unicodedata よりは色々細かく指定できます。
- 追加の変換テーブルを指定するのも簡単です。
- 変換しない文字を指定できます。
- 変換方法は単純に replace です。正規表現ほか色々と試したんですが、たぶん速い方です。
ドキュメント
モジュール、関数に記載の doc を参照してください。
ダウンロード
https://github.com/yuka2py/cnvk
コード
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
#!/usr/bin/env python # -*- coding: utf-8 -*- """ cnvk 0.9.3 - 全角・半角・ひらがな・カタカナ等を変換する簡単なモジュールです Author: yuka2py Lisence: Artistic License 2.0 Usage: import cnvk text = cnvk.convert(text, cnvk.H_ALPHA, cnvk.H_NUM) #英数字を半角に変換 text = cnvk.convert(text, cnvk.H_ALPHA, cnvk.H_NUM, {u"-":u"-"}) #追加の変換を dict で指示 text = cnvk.convert(text, cnvk.HIRA2KATA, cnvk.H_KATA) #ひらがなも含め、半角カタカナに変換 text = cnvk.convert(text, cnvk.Z_KATA, cnvk.KATA2HIRA) #カタカナも含め、全角ひらがなに変換 text = cnvk.convert(text, cnvk.HAC, skip=u"$&") #u"$" と u"&" 以外の ASCII 文字を半角に変換 """ def convert(text, *maps, **ops): """ 変換マップを指定して、文字列を変換します。 追加の変換マップを dict や tuple で簡単に利用できます。 処理をスキップする文字を指定することが出来ます。 args: text: 変換元のテキスト。unicode を必要とします。 maps: 変換マップの指定。tuple, dict または tuple を返す関数(callable オブジェクト)で指定。 マップは指定された順序に実行されます。 skip: 変換しない除外文字の指定。tuple または文字列で指定 tuple で指定すると各要素を除外。文字列で指定すると含まれる全ての文字を除外。 return: converted unicode string built-in maps: H_SPACE (HS): スペースを半角に統一 H_NUM (HN): 数字を半角に統一 H_ALPHA (HA): 英字を半角に統一 H_KIGO (HKG): ASCII記号を半角に統一 H_KATA (HK): カタカナを半角カタカナに統一 H_ASCII (HAC): アスキー文字を半角に統一(スペースを除く)(=H_NUM + H_ALPHA + H_KIGO) Z_SPACE (ZS): スペースを全角に統一 Z_NUM (ZN): 数字を全角に統一 Z_ALPHA (ZA): 英字を全角に統一 Z_KIGO (ZKG): ASCII記号を全角に統一 Z_KATA (ZK): カタカナを全角に統一 Z_ASCII (ZAC): アスキー文字を全角に統一(スペースを除く)(=Z_NUM + Z_ALPHA + Z_KIGO) HIRA2KATA (H2K): ひらがなをカタカナに変換 KATA2HIRA (K2H): カタカナをひらがなに変換 """ if "skip" in ops: skip = ops["skip"] if isinstance(skip, basestring): skip = tuple(skip) def replace(text, fr, to): return text if fr in skip else text.replace(fr, to) else: def replace(text, fr, to): return text.replace(fr, to) for map in maps: if callable(map): map = map() elif isinstance(map, dict): map = map.items() for fr, to in map: text = replace(text, fr, to) return text H_SPACE = HS = ((u" ",u" "),) H_NUM = HN = ( (u"0",u"0"),(u"1",u"1"),(u"2",u"2"),(u"3",u"3"),(u"4",u"4"), (u"5",u"5"),(u"6",u"6"),(u"7",u"7"),(u"8",u"8"),(u"9",u"9"), ) H_ALPHA = HA = ( (u"a",u"a"),(u"b",u"b"),(u"c",u"c"),(u"d",u"d"),(u"e",u"e"), 
(u"f",u"f"),(u"g",u"g"),(u"h",u"h"),(u"i",u"i"),(u"j",u"j"), (u"k",u"k"),(u"l",u"l"),(u"m",u"m"),(u"n",u"n"),(u"o",u"o"), (u"p",u"p"),(u"q",u"q"),(u"r",u"r"),(u"s",u"s"),(u"t",u"t"), (u"u",u"u"),(u"v",u"v"),(u"w",u"w"),(u"x",u"x"),(u"y",u"y"),(u"z",u"z"), (u"A",u"A"),(u"B",u"B"),(u"C",u"C"),(u"D",u"D"),(u"E",u"E"), (u"F",u"F"),(u"G",u"G"),(u"H",u"H"),(u"I",u"I"),(u"J",u"J"), (u"K",u"K"),(u"L",u"L"),(u"M",u"M"),(u"N",u"N"),(u"O",u"O"), (u"P",u"P"),(u"Q",u"Q"),(u"R",u"R"),(u"S",u"S"),(u"T",u"T"), (u"U",u"U"),(u"V",u"V"),(u"W",u"W"),(u"X",u"X"),(u"Y",u"Y"),(u"Z",u"Z"), ) H_KIGO = HKG = ( (u".",u"."),(u",",u","),(u"!",u"!"),(u"?",u"?"),(u"”",u'"'), (u"’",u"'"),(u"‘",u"`"),(u"@",u"@"),(u"_",u"_"),(u":",u":"), (u";",u";"),(u"#",u"#"),(u"$",u"$"),(u"%",u"%"),(u"&",u"&"), (u"(",u"("),(u")",u")"),(u"‐",u"-"),(u"=",u"="),(u"*",u"*"), (u"+",u"+"),(u"-",u"-"),(u"/",u"/"),(u"<",u"<"),(u">",u">"), (u"[",u"["),(u"¥",u"\"),(u"]",u"]"),(u"^",u"^"),(u"{",u"{"), (u"|",u"|"),(u"}",u"}"),(u"~",u"~") ) H_KATA = HK = ( (u"ァ",u"ァ"),(u"ィ",u"ィ"),(u"ゥ",u"ゥ"),(u"ェ",u"ェ"),(u"ォ",u"ォ"), (u"ッ",u"ッ"),(u"ャ",u"ャ"),(u"ュ",u"ュ"),(u"ョ",u"ョ"), (u"ガ",u"ガ"),(u"ギ",u"ギ"),(u"グ",u"グ"),(u"ゲ",u"ゲ"),(u"ゴ",u"ゴ"), (u"ザ",u"ザ"),(u"ジ",u"ジ"),(u"ズ",u"ズ"),(u"ゼ",u"ゼ"),(u"ゾ",u"ゾ"), (u"ダ",u"ダ"),(u"ヂ",u"ヂ"),(u"ヅ",u"ヅ"),(u"デ",u"デ"),(u"ド",u"ド"), (u"バ",u"バ"),(u"ビ",u"ビ"),(u"ブ",u"ブ"),(u"ベ",u"ベ"),(u"ボ",u"ボ"), (u"パ",u"パ"),(u"ピ",u"ピ"),(u"プ",u"プ"),(u"ペ",u"ペ"),(u"ポ",u"ポ"), (u"ヴ",u"ヴ"), (u"ア",u"ア"),(u"イ",u"イ"),(u"ウ",u"ウ"),(u"エ",u"エ"),(u"オ",u"オ"), (u"カ",u"カ"),(u"キ",u"キ"),(u"ク",u"ク"),(u"ケ",u"ケ"),(u"コ",u"コ"), (u"サ",u"サ"),(u"シ",u"シ"),(u"ス",u"ス"),(u"セ",u"セ"),(u"ソ",u"ソ"), (u"タ",u"タ"),(u"チ",u"チ"),(u"ツ",u"ツ"),(u"テ",u"テ"),(u"ト",u"ト"), (u"ナ",u"ナ"),(u"ニ",u"ニ"),(u"ヌ",u"ヌ"),(u"ネ",u"ネ"),(u"ノ",u"ノ"), (u"ハ",u"ハ"),(u"ヒ",u"ヒ"),(u"フ",u"フ"),(u"ヘ",u"ヘ"),(u"ホ",u"ホ"), (u"マ",u"マ"),(u"ミ",u"ミ"),(u"ム",u"ム"),(u"メ",u"メ"),(u"モ",u"モ"), (u"ヤ",u"ヤ"),(u"ユ",u"ユ"),(u"ヨ",u"ヨ"), (u"ラ",u"ラ"),(u"リ",u"リ"),(u"ル",u"ル"),(u"レ",u"レ"),(u"ロ",u"ロ"), 
(u"ワ",u"ワ"),(u"ヲ",u"ヲ"),(u"ン",u"ン"), (u"。",u"。"),(u"、",u"、"),(u"゛",u"゙"),(u"゜",u"゚"), (u"「",u"「"),(u"」",u"」"),(u"・",u"・"),(u"ー",u"ー"), ) HIRA2KATA = ( (u"ぁ",u"ァ"),(u"ぃ",u"ィ"),(u"ぅ",u"ゥ"),(u"ぇ",u"ェ"),(u"ぉ",u"ォ"), (u"っ",u"ッ"),(u"ゃ",u"ャ"),(u"ゅ",u"ュ"),(u"ょ",u"ョ"), (u"が",u"ガ"),(u"ぎ",u"ギ"),(u"ぐ",u"グ"),(u"げ",u"ゲ"),(u"ご",u"ゴ"), (u"ざ",u"ザ"),(u"じ",u"ジ"),(u"ず",u"ズ"),(u"ぜ",u"ゼ"),(u"ぞ",u"ゾ"), (u"だ",u"ダ"),(u"ぢ",u"ヂ"),(u"づ",u"ヅ"),(u"で",u"デ"),(u"ど",u"ド"), (u"ば",u"バ"),(u"び",u"ビ"),(u"ぶ",u"ブ"),(u"べ",u"ベ"),(u"ぼ",u"ボ"), (u"ぱ",u"パ"),(u"ぴ",u"ピ"),(u"ぷ",u"プ"),(u"ぺ",u"ペ"),(u"ぽ",u"ポ"), (u"ヴ",u"ヴ"), (u"あ",u"ア"),(u"い",u"イ"),(u"う",u"ウ"),(u"え",u"エ"),(u"お",u"オ"), (u"か",u"カ"),(u"き",u"キ"),(u"く",u"ク"),(u"け",u"ケ"),(u"こ",u"コ"), (u"さ",u"サ"),(u"し",u"シ"),(u"す",u"ス"),(u"せ",u"セ"),(u"そ",u"ソ"), (u"た",u"タ"),(u"ち",u"チ"),(u"つ",u"ツ"),(u"て",u"テ"),(u"と",u"ト"), (u"な",u"ナ"),(u"に",u"ニ"),(u"ぬ",u"ヌ"),(u"ね",u"ネ"),(u"の",u"ノ"), (u"は",u"ハ"),(u"ひ",u"ヒ"),(u"ふ",u"フ"),(u"へ",u"ヘ"),(u"ほ",u"ホ"), (u"ま",u"マ"),(u"み",u"ミ"),(u"む",u"ム"),(u"め",u"メ"),(u"も",u"モ"), (u"や",u"ヤ"),(u"ゆ",u"ユ"),(u"よ",u"ヨ"), (u"ら",u"ラ"),(u"り",u"リ"),(u"る",u"ル"),(u"れ",u"レ"),(u"ろ",u"ロ"), (u"わ",u"ワ"),(u"を",u"ヲ"),(u"ん",u"ン"), ) Z_SPACE = ZS = ((u" ",u" "),) Z_NUM = ZN = lambda: ((h, z) for z, h in H_NUM) Z_ALPHA = ZA = lambda: ((h, z) for z, h in H_ALPHA) Z_KIGO = ZKG = lambda: ((h, z) for z, h in H_KIGO) Z_KATA = ZK = lambda: ((h, z) for z, h in H_KATA) KATA2HIRA = lambda: ((k, h) for h, k in HIRA2KATA) H_ASCII = HAC = lambda: ((fr, to) for map in (H_ALPHA, H_NUM, H_KIGO) for fr, to in map) Z_ASCII = ZAC = lambda: ((h, z) for z, h in H_ASCII()) |