サロゲートペア文字が流行ってきているらしいので対策を施す。 MySQLはサロゲートペアを落としてしまうようなのでバリデータで弾いてみよう。
utils/fields.py
from django import newforms as forms
from django.newforms.util import ValidationError
from django.utils.translation import ugettext
from django.conf import settings
import re
# A UTF-16 high (lead) surrogate immediately followed by a low (trail) one.
_SURROGATE_PAIR_RE = re.compile(u"[\uD800-\uDBFF][\uDC00-\uDFFF]")


def has_surrogate_pair(strings):
    """Return True if ``strings`` contains a character outside the BMP.

    Such a character shows up either as a high/low surrogate pair (when the
    text is held as UTF-16 code units) or as a single code point above
    U+FFFF (when the interpreter stores full code points). Both forms are
    detected. Unpaired surrogates are not reported.

    ``strings`` may be text or an encoded byte string; anything that is not
    already text is decoded with ``settings.DEFAULT_CHARSET`` first, so a
    value that cannot be decoded raises the decoding error (TypeError /
    UnicodeDecodeError) to the caller.
    """
    # The text type is taken from a literal so that subclasses of it are
    # accepted as-is instead of being passed to the decoding constructor,
    # which refuses input that is already text.
    text_type = type(u"")
    if not isinstance(strings, text_type):
        strings = text_type(strings, settings.DEFAULT_CHARSET)

    # search() examines every adjacent pair of units, so a pair that follows
    # an unpaired high surrogate is still found.
    if _SURROGATE_PAIR_RE.search(strings):
        return True

    # When each element is a whole code point, a supplementary character is
    # one element with an ordinal above U+FFFF rather than a pair.
    for char in strings:
        if ord(char) > 0xFFFF:
            return True
    return False
class CharField(forms.CharField):
    """Character field that can optionally reject surrogate-pair characters.

    With ``surrogate_pair`` left at its default of True, ``clean`` returns
    whatever the parent field's ``clean`` returns. With
    ``surrogate_pair=False``, ``clean`` raises ValidationError for any value
    that ``has_surrogate_pair`` reports as containing such characters.
    """

    def __init__(self, surrogate_pair=True, *args, **kwargs):
        # NOTE(review): ``surrogate_pair`` is the first positional parameter,
        # so positional arguments intended for the parent field shift by one
        # -- confirm that callers pass field options by keyword.
        super(CharField, self).__init__(*args, **kwargs)
        self.surrogate_pair = surrogate_pair

    def clean(self, value):
        """Run the parent validation, then the optional surrogate-pair check."""
        cleaned = super(CharField, self).clean(value)
        if self.surrogate_pair:
            # Surrogate pairs are allowed: nothing more to verify.
            return cleaned
        if has_surrogate_pair(cleaned):
            message = ugettext(u'Ensure this value has not surrogate pair characters.')
            raise ValidationError(message)
        return cleaned
>>> field = CharField()
>>> field.clean("森鷗外") # 正式版の"森鴎外"
u'\u68ee\u9dd7\u5916'
>>> field.clean("𣏐") # "木"+"夕"
u'\U000233d0'
>>> field = CharField(surrogate_pair=False)
>>> field.clean("森鷗外") # 正式版の"森鴎外"
u'\u68ee\u9dd7\u5916'
>>> field.clean("𣏐") # "木"+"夕"
Traceback (most recent call last):
    ...
ValidationError: [u'Ensure this value has not surrogate pair characters.']
試しに、このようなviewを用意して、
from django.http import HttpResponse
from django import newforms as forms
from utils import fields
class TestForm(forms.Form):
    """Demo form pairing a permissive field with a restrictive one."""

    # Default behaviour: surrogate-pair characters are accepted.
    text1 = fields.CharField()
    # Surrogate-pair characters are rejected with a validation error.
    text2 = fields.CharField(surrogate_pair=False)
def test(request):
    """Bind TestForm to the GET parameters and return its rendering as the response."""
    form = TestForm(request.GET)
    rendered = str(form)
    return HttpResponse(rendered)
`?text1=森鷗外&text2=𣏐` のようなGETパラメータを付けてアクセスしてみると様子がわかります。
messageは適当、国際化がよくわからないなぁ、勉強しよう。
