View All Posts. MiCHiLU.com powered by Django ;-)

[Django]: サロゲートペア文字を検出するカスタムフィールド

サロゲートペア文字が流行ってきているらしいので対策を施す。 MySQLはサロゲートペアを落としてしまうようなのでバリデータで弾いてみよう。

utils/fields.py

from django import newforms as forms
from django.newforms.util import ValidationError
from django.utils.translation import ugettext
from django.conf import settings
import re

# A UTF-16 high surrogate immediately followed by a low surrogate.
_SURROGATE_PAIR_RE = re.compile(u"[\uD800-\uDBFF][\uDC00-\uDFFF]")


def has_surrogate_pair(strings):
    """Return True if ``strings`` contains a surrogate pair character.

    A surrogate pair character is a character outside the Basic
    Multilingual Plane.  Depending on the interpreter build it shows up
    either as two UTF-16 code units (high surrogate + low surrogate) or as
    a single code point above 0xFFFF; both forms are detected.

    Parameters:
        strings: text to inspect.  Anything that is not already a text
            (unicode) object is decoded with ``settings.DEFAULT_CHARSET``.

    Returns:
        True when at least one surrogate pair character is present,
        otherwise False (including for empty input).
    """
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str  # no ``unicode`` builtin: ``str`` is the text type
    # isinstance (not an exact type check) so that unicode subclasses are
    # accepted as-is instead of being passed to the decoder, which would
    # raise TypeError for already-decoded text.
    if not isinstance(strings, text_type):
        strings = text_type(strings, settings.DEFAULT_CHARSET)
    # Scan every adjacent pair of code units.  Consuming characters two at a
    # time would skip a pair that starts right after an unpaired high
    # surrogate.
    if _SURROGATE_PAIR_RE.search(strings):
        return True
    # Builds that store non-BMP characters as one code point never expose
    # the two surrogate halves, so check the code point value as well.
    return any(ord(ch) > 0xFFFF for ch in strings)


class CharField(forms.CharField):
    """``forms.CharField`` variant that can reject surrogate pair characters.

    Parameters:
        surrogate_pair: when true (the default) values are returned exactly
            as the parent field cleans them; when false, a cleaned value for
            which ``has_surrogate_pair`` reports a match is rejected.
        *args, **kwargs: forwarded unchanged to ``forms.CharField``.
    """

    def __init__(self, surrogate_pair=True, *args, **kwargs):
        super(CharField, self).__init__(*args, **kwargs)
        self.surrogate_pair = surrogate_pair

    def clean(self, value):
        """Run the parent validation, then the optional surrogate pair check.

        Raises:
            ValidationError: if surrogate pairs are disallowed for this
                field and the cleaned value contains one.
        """
        cleaned = super(CharField, self).clean(value)
        # Surrogate pairs allowed: nothing more to verify.
        if self.surrogate_pair:
            return cleaned
        if has_surrogate_pair(cleaned):
            message = ugettext(u'Ensure this value has not surrogate pair characters.')
            raise ValidationError(message)
        return cleaned
>>> field = CharField()
>>> field.clean("森鷗外")  # 正式版の"森鴎外"
u'\u68ee\u9dd7\u5916'
>>> field.clean("𣏐")    # "木"+"夕"
u'\U000233d0'

>>> field = CharField(surrogate_pair=False)
>>> field.clean("森鷗外")  # 正式版の"森鴎外"
u'\u68ee\u9dd7\u5916'
>>> field.clean("𣏐")   # "木"+"夕"
Traceback (most recent call last):
    ...
    ValidationError: [u'Ensure this value has not surrogate pair characters.']

試しに、このようなviewを用意して、

from django.http import HttpResponse
from django import newforms as forms
from utils import fields

class TestForm(forms.Form):
    """Demo form with two text fields built from ``fields.CharField``.

    ``text1`` uses the field's defaults; ``text2`` is created with
    ``surrogate_pair=False``.
    """

    text1 = fields.CharField()
    text2 = fields.CharField(surrogate_pair=False)

def test(request):
    """Bind ``TestForm`` to the request's GET data and return it rendered.

    The form's string representation is used as the response body.
    """
    bound_form = TestForm(request.GET)
    rendered = str(bound_form)
    return HttpResponse(rendered)

URLconfに登録し、`?text1=𣏐&text2=𣏐` などとクエリ文字列を付けてアクセスしてみると様子がわかります。

messageは適当、国際化がよくわからないなぁ、勉強しよう。

Fri, 7 Sep 2007 17:25:53 +0900 source edit
Creative Commons License
This work is licensed under a Creative Commons Attribution-Noncommercial-Share Alike 2.1 Japan License.
View All Posts. MiCHiLU.com powered by Django ;-)