#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
cnvk 0.9.3 - 全角・半角・ひらがな・カタカナ等を変換する簡単なモジュールです
Author:
yuka2py
License:
Artistic License 2.0
Usage:
import cnvk
text = cnvk.convert(text, cnvk.H_ALPHA, cnvk.H_NUM) #英数字を半角に変換
text = cnvk.convert(text, cnvk.H_ALPHA, cnvk.H_NUM, {u"－":u"-"}) #追加の変換を dict で指示
text = cnvk.convert(text, cnvk.HIRA2KATA, cnvk.H_KATA) #ひらがなも含め、半角カタカナに変換
text = cnvk.convert(text, cnvk.Z_KATA, cnvk.KATA2HIRA) #カタカナも含め、全角ひらがなに変換
text = cnvk.convert(text, cnvk.HAC, skip=u"＄＆") #u"＄" と u"＆" 以外の ASCII 文字を半角に変換
"""
def convert(text, *maps, **ops):
    """Convert ``text`` by applying one or more replacement maps in order.

    Every map is reduced to a series of ``(from, to)`` pairs and each pair is
    applied with ``text.replace(from, to)``, one after another. Because the
    replacements are sequential, the output of an earlier pair (or an earlier
    map) can be picked up by a later one.

    Args:
        text: The text to convert (a unicode string).
        *maps: Conversion maps, applied in the order given. Each one may be
            an iterable of ``(from, to)`` pairs (e.g. a tuple of tuples),
            a ``dict`` of ``{from: to}``, or a callable taking no arguments
            that returns an iterable of such pairs.
        **ops: Only ``skip`` is recognised; other keywords are ignored.
            ``skip`` names the ``from`` strings that must be left untouched.
            A string excludes each of its characters; any other container
            (e.g. a tuple) excludes each of its elements. Note that the
            comparison is made against the ``from`` side of a pair, i.e. the
            text as it looks *before* conversion.

    Returns:
        The converted string.

    Built-in maps defined in this module (short alias in parentheses):
        H_SPACE (HS):   spaces to half-width
        H_NUM (HN):     digits to half-width
        H_ALPHA (HA):   letters to half-width
        H_KIGO (HKG):   ASCII symbols to half-width
        H_KATA (HK):    katakana to half-width katakana
        H_ASCII (HAC):  H_ALPHA + H_NUM + H_KIGO (space not included)
        Z_SPACE (ZS):   spaces to full-width
        Z_NUM (ZN):     digits to full-width
        Z_ALPHA (ZA):   letters to full-width
        Z_KIGO (ZKG):   ASCII symbols to full-width
        Z_KATA (ZK):    katakana to full-width
        Z_ASCII (ZAC):  inverse of H_ASCII (space not included)
        HIRA2KATA:      hiragana to katakana
        KATA2HIRA:      katakana to hiragana
    """
    try:
        string_types = basestring  # Python 2: matches both str and unicode
    except NameError:
        string_types = str  # Python 3: basestring was removed

    # A missing, None or empty ``skip`` all mean "exclude nothing".
    skip = ops.get("skip") or ()
    if isinstance(skip, string_types):
        # Split into characters so membership is per character, not substring.
        skip = tuple(skip)

    for mapping in maps:
        if callable(mapping):
            mapping = mapping()
        elif isinstance(mapping, dict):
            mapping = mapping.items()
        for fr, to in mapping:
            if fr not in skip:
                text = text.replace(fr, to)
    return text
# Full-width (ideographic) space -> half-width space.
# The full-width space is written as an escape because it is visually
# indistinguishable from an ordinary space and is easily lost when the file
# is re-encoded or whitespace-normalized.
H_SPACE = HS = ((u"\u3000", u" "),)
# Full-width digits (U+FF10-U+FF19) -> ASCII digits.
# Each pair is (full-width source, half-width target).
H_NUM = HN = (
    (u"０", u"0"), (u"１", u"1"), (u"２", u"2"), (u"３", u"3"), (u"４", u"4"),
    (u"５", u"5"), (u"６", u"6"), (u"７", u"7"), (u"８", u"8"), (u"９", u"9"),
)
# Full-width Latin letters (U+FF41-U+FF5A, U+FF21-U+FF3A) -> ASCII letters.
# Each pair is (full-width source, half-width target).
H_ALPHA = HA = (
    (u"ａ", u"a"), (u"ｂ", u"b"), (u"ｃ", u"c"), (u"ｄ", u"d"), (u"ｅ", u"e"),
    (u"ｆ", u"f"), (u"ｇ", u"g"), (u"ｈ", u"h"), (u"ｉ", u"i"), (u"ｊ", u"j"),
    (u"ｋ", u"k"), (u"ｌ", u"l"), (u"ｍ", u"m"), (u"ｎ", u"n"), (u"ｏ", u"o"),
    (u"ｐ", u"p"), (u"ｑ", u"q"), (u"ｒ", u"r"), (u"ｓ", u"s"), (u"ｔ", u"t"),
    (u"ｕ", u"u"), (u"ｖ", u"v"), (u"ｗ", u"w"), (u"ｘ", u"x"), (u"ｙ", u"y"),
    (u"ｚ", u"z"),
    (u"Ａ", u"A"), (u"Ｂ", u"B"), (u"Ｃ", u"C"), (u"Ｄ", u"D"), (u"Ｅ", u"E"),
    (u"Ｆ", u"F"), (u"Ｇ", u"G"), (u"Ｈ", u"H"), (u"Ｉ", u"I"), (u"Ｊ", u"J"),
    (u"Ｋ", u"K"), (u"Ｌ", u"L"), (u"Ｍ", u"M"), (u"Ｎ", u"N"), (u"Ｏ", u"O"),
    (u"Ｐ", u"P"), (u"Ｑ", u"Q"), (u"Ｒ", u"R"), (u"Ｓ", u"S"), (u"Ｔ", u"T"),
    (u"Ｕ", u"U"), (u"Ｖ", u"V"), (u"Ｗ", u"W"), (u"Ｘ", u"X"), (u"Ｙ", u"Y"),
    (u"Ｚ", u"Z"),
)
# Full-width / typographic symbols -> their ASCII counterparts.
# Each pair is (source, half-width ASCII target).
# Two different sources (U+2010 HYPHEN and the full-width hyphen-minus) share
# the target "-"; when this table is inverted, the first of them wins because
# replacements are applied in table order.
H_KIGO = HKG = (
    (u"．", u"."), (u"，", u","), (u"！", u"!"), (u"？", u"?"), (u"”", u'"'),
    (u"’", u"'"), (u"‘", u"`"), (u"＠", u"@"), (u"＿", u"_"), (u"：", u":"),
    (u"；", u";"), (u"＃", u"#"), (u"＄", u"$"), (u"％", u"%"), (u"＆", u"&"),
    (u"（", u"("), (u"）", u")"), (u"‐", u"-"), (u"＝", u"="), (u"＊", u"*"),
    (u"＋", u"+"), (u"－", u"-"), (u"／", u"/"), (u"＜", u"<"), (u"＞", u">"),
    # The backslash must be escaped; a lone u"\" does not terminate the literal.
    (u"［", u"["), (u"￥", u"\\"), (u"］", u"]"), (u"＾", u"^"), (u"｛", u"{"),
    (u"｜", u"|"), (u"｝", u"}"), (u"～", u"~"),
)
# Full-width katakana and Japanese punctuation -> half-width katakana forms
# (U+FF61-U+FF9F). Each pair is (full-width source, half-width target).
# Voiced and semi-voiced kana become two half-width characters (base kana
# followed by the half-width sound mark). They are listed before the plain
# kana so that the inverted table (used for half-width -> full-width) matches
# the two-character sequences before their one-character prefixes.
H_KATA = HK = (
    (u"ァ", u"ｧ"), (u"ィ", u"ｨ"), (u"ゥ", u"ｩ"), (u"ェ", u"ｪ"), (u"ォ", u"ｫ"),
    (u"ッ", u"ｯ"), (u"ャ", u"ｬ"), (u"ュ", u"ｭ"), (u"ョ", u"ｮ"),
    (u"ガ", u"ｶﾞ"), (u"ギ", u"ｷﾞ"), (u"グ", u"ｸﾞ"), (u"ゲ", u"ｹﾞ"), (u"ゴ", u"ｺﾞ"),
    (u"ザ", u"ｻﾞ"), (u"ジ", u"ｼﾞ"), (u"ズ", u"ｽﾞ"), (u"ゼ", u"ｾﾞ"), (u"ゾ", u"ｿﾞ"),
    (u"ダ", u"ﾀﾞ"), (u"ヂ", u"ﾁﾞ"), (u"ヅ", u"ﾂﾞ"), (u"デ", u"ﾃﾞ"), (u"ド", u"ﾄﾞ"),
    (u"バ", u"ﾊﾞ"), (u"ビ", u"ﾋﾞ"), (u"ブ", u"ﾌﾞ"), (u"ベ", u"ﾍﾞ"), (u"ボ", u"ﾎﾞ"),
    (u"パ", u"ﾊﾟ"), (u"ピ", u"ﾋﾟ"), (u"プ", u"ﾌﾟ"), (u"ペ", u"ﾍﾟ"), (u"ポ", u"ﾎﾟ"),
    (u"ヴ", u"ｳﾞ"),
    (u"ア", u"ｱ"), (u"イ", u"ｲ"), (u"ウ", u"ｳ"), (u"エ", u"ｴ"), (u"オ", u"ｵ"),
    (u"カ", u"ｶ"), (u"キ", u"ｷ"), (u"ク", u"ｸ"), (u"ケ", u"ｹ"), (u"コ", u"ｺ"),
    (u"サ", u"ｻ"), (u"シ", u"ｼ"), (u"ス", u"ｽ"), (u"セ", u"ｾ"), (u"ソ", u"ｿ"),
    (u"タ", u"ﾀ"), (u"チ", u"ﾁ"), (u"ツ", u"ﾂ"), (u"テ", u"ﾃ"), (u"ト", u"ﾄ"),
    (u"ナ", u"ﾅ"), (u"ニ", u"ﾆ"), (u"ヌ", u"ﾇ"), (u"ネ", u"ﾈ"), (u"ノ", u"ﾉ"),
    (u"ハ", u"ﾊ"), (u"ヒ", u"ﾋ"), (u"フ", u"ﾌ"), (u"ヘ", u"ﾍ"), (u"ホ", u"ﾎ"),
    (u"マ", u"ﾏ"), (u"ミ", u"ﾐ"), (u"ム", u"ﾑ"), (u"メ", u"ﾒ"), (u"モ", u"ﾓ"),
    (u"ヤ", u"ﾔ"), (u"ユ", u"ﾕ"), (u"ヨ", u"ﾖ"),
    (u"ラ", u"ﾗ"), (u"リ", u"ﾘ"), (u"ル", u"ﾙ"), (u"レ", u"ﾚ"), (u"ロ", u"ﾛ"),
    (u"ワ", u"ﾜ"), (u"ヲ", u"ｦ"), (u"ン", u"ﾝ"),
    (u"。", u"｡"), (u"、", u"､"), (u"゛", u"ﾞ"), (u"゜", u"ﾟ"),
    (u"「", u"｢"), (u"」", u"｣"), (u"・", u"･"), (u"ー", u"ｰ"),
)
# Hiragana -> full-width katakana. Each pair is (hiragana, katakana).
# H2K is the short alias that convert()'s docstring advertises for this table.
HIRA2KATA = H2K = (
    (u"ぁ", u"ァ"), (u"ぃ", u"ィ"), (u"ぅ", u"ゥ"), (u"ぇ", u"ェ"), (u"ぉ", u"ォ"),
    (u"っ", u"ッ"), (u"ゃ", u"ャ"), (u"ゅ", u"ュ"), (u"ょ", u"ョ"),
    (u"が", u"ガ"), (u"ぎ", u"ギ"), (u"ぐ", u"グ"), (u"げ", u"ゲ"), (u"ご", u"ゴ"),
    (u"ざ", u"ザ"), (u"じ", u"ジ"), (u"ず", u"ズ"), (u"ぜ", u"ゼ"), (u"ぞ", u"ゾ"),
    (u"だ", u"ダ"), (u"ぢ", u"ヂ"), (u"づ", u"ヅ"), (u"で", u"デ"), (u"ど", u"ド"),
    (u"ば", u"バ"), (u"び", u"ビ"), (u"ぶ", u"ブ"), (u"べ", u"ベ"), (u"ぼ", u"ボ"),
    (u"ぱ", u"パ"), (u"ぴ", u"ピ"), (u"ぷ", u"プ"), (u"ぺ", u"ペ"), (u"ぽ", u"ポ"),
    # Hiragana VU (U+3094); a katakana-to-katakana pair here would be a no-op.
    (u"ゔ", u"ヴ"),
    (u"あ", u"ア"), (u"い", u"イ"), (u"う", u"ウ"), (u"え", u"エ"), (u"お", u"オ"),
    (u"か", u"カ"), (u"き", u"キ"), (u"く", u"ク"), (u"け", u"ケ"), (u"こ", u"コ"),
    (u"さ", u"サ"), (u"し", u"シ"), (u"す", u"ス"), (u"せ", u"セ"), (u"そ", u"ソ"),
    (u"た", u"タ"), (u"ち", u"チ"), (u"つ", u"ツ"), (u"て", u"テ"), (u"と", u"ト"),
    (u"な", u"ナ"), (u"に", u"ニ"), (u"ぬ", u"ヌ"), (u"ね", u"ネ"), (u"の", u"ノ"),
    (u"は", u"ハ"), (u"ひ", u"ヒ"), (u"ふ", u"フ"), (u"へ", u"ヘ"), (u"ほ", u"ホ"),
    (u"ま", u"マ"), (u"み", u"ミ"), (u"む", u"ム"), (u"め", u"メ"), (u"も", u"モ"),
    (u"や", u"ヤ"), (u"ゆ", u"ユ"), (u"よ", u"ヨ"),
    (u"ら", u"ラ"), (u"り", u"リ"), (u"る", u"ル"), (u"れ", u"レ"), (u"ろ", u"ロ"),
    (u"わ", u"ワ"), (u"を", u"ヲ"), (u"ん", u"ン"),
)
# Half-width space -> full-width (ideographic) space.
# The full-width space is written as an escape because it is visually
# indistinguishable from an ordinary space and is easily lost when the file
# is re-encoded or whitespace-normalized.
Z_SPACE = ZS = ((u" ", u"\u3000"),)
def Z_NUM():
    """Return a generator over H_NUM's pairs with source and target swapped."""
    return ((half, full) for full, half in H_NUM)


def Z_ALPHA():
    """Return a generator over H_ALPHA's pairs with source and target swapped."""
    return ((half, full) for full, half in H_ALPHA)


def Z_KIGO():
    """Return a generator over H_KIGO's pairs with source and target swapped."""
    return ((half, full) for full, half in H_KIGO)


def Z_KATA():
    """Return a generator over H_KATA's pairs with source and target swapped."""
    return ((half, full) for full, half in H_KATA)


# Short aliases; each names the same callable as its long form.
ZN = Z_NUM
ZA = Z_ALPHA
ZKG = Z_KIGO
ZK = Z_KATA
def KATA2HIRA():
    """Return a generator over HIRA2KATA's pairs with the two sides swapped.

    The result maps katakana back to hiragana and is meant to be passed to
    convert(), which calls callables to obtain their pairs.
    """
    return ((kata, hira) for hira, kata in HIRA2KATA)


# Short alias advertised in convert()'s docstring.
K2H = KATA2HIRA
def H_ASCII():
    """Return a generator chaining the pairs of H_ALPHA, H_NUM and H_KIGO, in that order."""
    return (
        (src, dst)
        for table in (H_ALPHA, H_NUM, H_KIGO)
        for src, dst in table
    )


def Z_ASCII():
    """Return a generator over H_ASCII()'s pairs with source and target swapped."""
    return ((half, full) for full, half in H_ASCII())


# Short aliases; each names the same callable as its long form.
HAC = H_ASCII
ZAC = Z_ASCII