2014-05-10 2 views
1

.NET, Java, Perl, PHP, Python 3(?)는 모두 유니코드 글자(letter)와 일치하는 PCRE 방식의 정규 표현식 단축 표기 `\p{L}`을 지원하지만, 제가 아는 한 자바스크립트에는 이런 단축 표기가 없습니다. 저는 문자열 조작에 초점을 맞춘 라이브러리를 만들고 있어서 자바스크립트에서도 이와 동등한 표현이 필요합니다. 지금까지 길이가 1172자인 정규식을 만들었지만, 다소 엉성한 방법으로 생성한 것입니다. 누군가 이것을 더 일반적이고 정확하게, 즉 모든 언어의 글자와 일치하도록 개선해 주실 수 있을까요? 아래는 이 정규식을 만든 과정을 위에서부터 순서대로 보여 주는 코드입니다.

// 
// JavaScript synonym for (.NET/Java/Perl/PCRE)'s `\p{L}` regexp. 


// Build the list of one-character strings whose UTF-16 code units run
// from the first code unit of `a` through the first code unit of `z`,
// both ends included.
//
// a - string whose first code unit opens the range
// z - string whose first code unit closes the range
//
// Returns an array of one-character strings. The array is empty when
// `a <= z` does not hold (the arguments are compared as given).
function crange (a, z) {
    var chars = [];

    if (a <= z) {
        var code = a.charCodeAt(0);
        var last = z.charCodeAt(0);

        while (code <= last) {
            chars.push(String.fromCharCode(code));
            code += 1;
        }
    }

    return chars;
}

// Array#intranges()
//
// Collapses the integers held by the array into runs of successive values.
// Elements that are not 32-bit integers (`(n | 0) === n`) are ignored and
// duplicates count once. The receiver itself is not modified.
//
// Returns a new array, ordered ascending, with one entry per run:
//   [n]           for an isolated integer
//   [first, last] for a run of two or more successive integers
//
// Example: [1, 2, 3, 5, 7, 8].intranges()  ->  [[1, 3], [5], [7, 8]]
//
// The method is installed with Object.defineProperty so that it is NOT
// enumerable: a plain assignment to Array.prototype would make the name
// show up in every `for...in` loop over any array.
Object.defineProperty(Array.prototype, 'intranges', {
    configurable: true,
    writable: true,
    enumerable: false,
    value: function () {

        // integers only, numerically ascending (filter returns a copy,
        // so sorting does not touch the receiver)
        var sorted = this
            .filter(function (n) { return (n | 0) === n; })
            .sort(function (x, y) { return x - y; });

        var count = sorted.length;
        var res = [];
        var i = 0;

        while (i < count) {
            var first = sorted[i];
            var last = first;
            i += 1;

            // In ascending order the gap to the next value is 0 for a
            // duplicate and 1 for a successor; both extend the current run.
            while (i < count && sorted[i] - last <= 1) {
                last = sorted[i];
                i += 1;
            }

            res.push(first === last ? [first] : [first, last]);
        }

        return res;
    }
});


var letter_regex = (function () {

    // every UTF-16 code unit from 0x0000 up to and including 0xffff,
    // as one-character strings; the array index equals the code unit
    var allChars = crange('\u0000', '\uffff');

    // Heuristic stand-in for "is a letter": keep a code unit when its
    // upper-case and lower-case forms differ. Characters that have no
    // distinct case forms are therefore never selected.
    var casedCodes = [];
    allChars.forEach(function (ch, code) {
        if (ch.toUpperCase() !== ch.toLowerCase()) {
            casedCodes.push(code);
        }
    });

    // group the code units into runs, then render each run as
    // `\xHH` / `\uHHHH` escapes (joined by '-' for a two-ended run)
    // so the result can be dropped into a `RegExp` character class
    return casedCodes
        .intranges()
        .map(function (bounds) {
            var escaped = bounds.map(function (cc) {
                var hex = cc.toString(16);
                if (cc <= 0xff) {
                    return '\\x' + pad02(hex);
                }
                return '\\u' + pad04(hex);
            });
            return escaped.join('-');
        })
        .join('');
}());

// 
// 
// and it generated this (10ft) long string: 
// 
// letter_regex = '\x41-\x5A\x61-\x7A\xB5\xC0-\xD6\xD8-\xDE\xE0-\xF6\xF8-\u0137\u0139-\u0148\u014A-\u018C\u018E-\u019A\u019C-\u01A9\u01AC-\u01B9\u01BC-\u01BD\u01BF\u01C4-\u01EF\u01F1-\u0220\u0222-\u0233\u023A-\u0254\u0256-\u0257\u0259\u025B\u0260\u0263\u0265-\u0266\u0268-\u0269\u026B\u026F\u0271-\u0272\u0275\u027D\u0280\u0283\u0288-\u028C\u0292\u0345\u0370-\u0373\u0376-\u0377\u037B-\u037D\u0386\u0388-\u038A\u038C\u038E-\u038F\u0391-\u03A1\u03A3-\u03AF\u03B1-\u03D1\u03D5-\u03F2\u03F4-\u03F5\u03F7-\u03FB\u03FD-\u0481\u048A-\u0527\u0531-\u0556\u0561-\u0586\u10A0-\u10C5\u10C7\u10CD\u1D79\u1D7D\u1E00-\u1E95\u1E9B\u1E9E\u1EA0-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F51\u1F53\u1F55\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB1\u1FB3\u1FB8-\u1FBC\u1FBE\u1FC3\u1FC8-\u1FCC\u1FD0-\u1FD1\u1FD8-\u1FDB\u1FE0-\u1FE1\u1FE5\u1FE8-\u1FEC\u1FF3\u1FF8-\u1FFC\u2126\u212A-\u212B\u2132\u214E\u2160-\u217F\u2183-\u2184\u24B6-\u24E9\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2C70\u2C72-\u2C73\u2C75-\u2C76\u2C7E-\u2CE3\u2CEB-\u2CEE\u2CF2-\u2CF3\u2D00-\u2D25\u2D27\u2D2D\uA640-\uA66D\uA680-\uA697\uA722-\uA72F\uA732-\uA76F\uA779-\uA787\uA78B-\uA78D\uA790-\uA793\uA7A0-\uA7AA\uFF21-\uFF3A\uFF41-\uFF5A'; 
// 
// 



// Left-pad `c` with '0' up to two characters and upper-case the result.
// Input that is already two or more characters long is only upper-cased.
//
// c - string of hexadecimal digits
//
// Returns the padded, upper-cased string.
function pad02 (c) {
    var zeros = '00'.slice(c.length);
    var padded = zeros + c;
    return padded.toUpperCase();
}

// Left-pad `c` with '0' up to four characters and upper-case the result.
// Input that is already four or more characters long is only upper-cased.
//
// c - string of hexadecimal digits
//
// Returns the padded, upper-cased string.
function pad04 (c) {
    var zeros = '0000'.slice(c.length);
    var padded = zeros + c;
    return padded.toUpperCase();
}

// `Array.prototype.filter` callback: keeps an element only when it passes
// `_isint` and this is its first occurrence in the array being filtered.
//
// node - current element
// pos  - index of the current element
// self - the array being filtered
//
// Returns true when the element should be kept, false otherwise.
function _intranges (node, pos, self) {
    if (!_isint(node)) {
        return false;
    }

    // indexOf reports the first occurrence, so later duplicates fail here
    return self.indexOf(node) >= pos;
}

// Tells whether `n` survives a round trip through 32-bit integer
// coercion unchanged (compared with strict equality, so non-numbers,
// fractions and values outside the 32-bit range all yield false).
//
// n - any value
//
// Returns a boolean.
function _isint (n) {
    var coerced = n | 0;
    return coerced === n;
}

// Comparator for `Array.prototype.sort` that orders numbers ascending.
//
// n1, n2 - the two values being compared
//
// Returns a negative number when n1 < n2, zero when they are equal and a
// positive number when n1 > n2.
function _nsort (n1, n2) {
    var delta = n1 - n2;
    return delta;
}
// /eof 
+2

http://xregexp.com/ 을 살펴보세요. – Xotic750

+1

https://github.com/slevithan/xregexp/blob/master/xregexp-all.js#L3460 – Xotic750

+0

[자바스크립트 + 유니코드](http://stackoverflow.com/questions/280712/javascript-unicode)의 중복일 가능성이 있습니다. –

답변

0

`\p{L}`에 해당하는 정규식을 찾은 것 같습니다. BabelMap 앱을 사용하여 생성했으며, {Ll, Lm, Lo, Lt, Lu} 범주에 속하는 48K개 이상의 글자를 포함합니다.

// JavaScript unicode letter regex: (4185 characters) 
letter_regex = /[\x41-\x5A\x61-\x7A\xAA\xB5\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u0527\u0531-\u0556\u0559\u0561-\u0587\u05D0-\u05EA\u05F0-\u05F2\u0620-\u064A\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u0858\u08A0\u08A2-\u08AC\u0904-\u0939\u093D\u0950\u0958-\u0961\u0971-\u0977\u0979-\u097F\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u09F0\u09F1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0\u0AE1\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B71\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C3D\u0C58\u0C59\u0C60\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0CF1\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D60\u0D61\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0E01-\u0E30\u0E32\u0E33\u0E40-\u0E46\u0E81\u0E82\u0E84\u0E87\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA\u0EAB\u0EAD-\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6\u0EDC-\u0EDF\u0F00\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F\u1050-\u1055\u105A-\u105D\u1061\u1065\u1066\u106E-\u1070\u1075-\u1081\u108E\u10A0-\u10C5\u10C7\u10CD\u10D0-\u10FA\u10FC-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u1
2C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u13A0-\u13F4\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7\u17DC\u1820-\u1877\u1880-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191C\u1950-\u196D\u1970-\u1974\u1980-\u19AB\u19C1-\u19C7\u1A00-\u1A16\u1A20-\u1A54\u1AA7\u1B05-\u1B33\u1B45-\u1B4B\u1B83-\u1BA0\u1BAE\u1BAF\u1BBA-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C7D\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5\u1CF6\u1D00-\u1DBF\u1E00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2090-\u209C\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2183\u2184\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2CE4\u2CEB-\u2CEE\u2CF2\u2CF3\u2D00-\u2D25\u2D27\u2D2D\u2D30-\u2D67\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u2E2F\u3005\u3006\u3031-\u3035\u303B\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF\u3105-\u312D\u3131-\u318E\u31A0-\u31BA\u31F0-\u31FF\u3400-\u4DB5\u4E00-\u9FCC\uA000-\uA48C\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA61F\uA62A\uA62B\uA640-\uA66E\uA67F-\uA697\uA6A0-\uA6E5\uA717-\uA71F\uA722-\uA788\uA78B-\uA78E\uA790-\uA793\uA7A0-\uA7AA\uA7F8-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA90A-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9CF\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA60-\uAA76\uAA7A\uAA80-\uAAAF\uAAB1\uAAB5\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADD\uAAE0-\uAAEA\uAAF2-\uAAF4\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uABC0-\uABE2\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB
40\uFB41\uFB43\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC]/; 
// 

코드 포인트가 0xffff보다 큰 글자까지 일치시키는 버전은 여기(here)에 게시했습니다. (SO 게시물 하나에 담기에는 텍스트가 너무 많습니다.)

관련 문제