如何高效地替换字符串中的所有重音字符?
为了在客户端实现一个简易的、接近正确的排序,我需要一个能在字符串中高效地进行单字符替换的JavaScript函数。
这是我的意思(注意,这适用于德语文本,其他语言排序不同):
原生排序得到的结果是错误的:a b c o u z ä ö ü;符合排序规则的正确结果应该是:a ä b c o ö u ü z
基本上,我需要将所有出现的“ä”替换为“a”(以此类推)。 这样,本地排序的结果将非常接近用户期望的(或者数据库将返回的)。
其他语言也可以这样做: Python提供str.translate()
,在Perl中有tr/…/…/
, XPath有一个函数translate()
, ColdFusion有ReplaceList()
。 但是JavaScript呢?
这是我现在所拥有的。
// s is expected to be a rather short string (roughly 200 characters
// at most, and usually far fewer).

/**
 * Produces a sort key for German text by swapping each umlaut for its
 * unaccented base letter. All other characters pass through untouched.
 *
 * Both the lookup table and the pattern live inside the function, so they
 * are created again on every call.
 *
 * @param {string} s - Text to convert.
 * @returns {string} The text with ä/ö/ü/Ä/Ö/Ü replaced by a/o/u/A/O/U.
 */
function makeSortString(s) {
  const baseLetterFor = {
    "ä": "a",
    "ö": "o",
    "ü": "u",
    "Ä": "A",
    "Ö": "O",
    "Ü": "U"
    // probably more to come
  };
  const umlautPattern = /[öäüÖÄÜ]/g;

  return s.replace(umlautPattern, (umlaut) => baseLetterFor[umlaut]);
}
首先,我不喜欢每次调用这个函数时都要重新构建这个正则表达式。 我想闭包在这方面可能会有所帮助,但不知为何我始终没能弄明白该怎么做。
有人可以想到更有效率的东西吗?
下面的答案分为两类:
- 不同程度的完整性和效率的字符串替换函数(我原来是在问)
- 后来提到了
String#localeCompare
,它在JS引擎中得到了广泛的支持,可以更加优雅地解决这个问题。
我无法具体评价你想用这个函数实现的功能本身,但如果你不喜欢每次调用都重新构建正则表达式,这里有两种解决方案,以及各自需要注意的问题。
这是一个方法来做到这一点:
/**
 * Variant of makeSortString that builds its pattern only once and keeps it
 * as the `translate_re` property of the function object.
 *
 * Because the property is publicly reachable, code outside the function can
 * overwrite it.
 *
 * @param {string} s - Text to convert.
 * @returns {string} The text with German umlauts replaced by base letters.
 */
function makeSortString(s) {
  // Create the pattern on first use; later calls reuse the stored one.
  if (!makeSortString.translate_re) {
    makeSortString.translate_re = /[öäüÖÄÜ]/g;
  }

  const baseLetterFor = {
    "ä": "a",
    "ö": "o",
    "ü": "u",
    "Ä": "A",
    "Ö": "O",
    "Ü": "U"
    // probably more to come
  };

  return s.replace(makeSortString.translate_re, (umlaut) => baseLetterFor[umlaut]);
}
这显然会使正则表达式成为函数本身的一个属性。 你可能不喜欢这一点(也可能喜欢,这取决于具体情况),因为正则表达式现在可以在函数体之外被修改。 例如,有人可以像下面这样替换函数内部使用的正则表达式:
makeSortString.translate_re = /[az]/g;
所以,有这个选择。
一种获得闭包的方法,从而防止某人修改正则表达式,将其定义为匿名函数赋值,如下所示:
/**
 * Closure-based makeSortString: the pattern is created once and captured
 * privately, so outside code cannot replace it. The lookup table is still
 * built on each call.
 *
 * @param {string} s - Text to convert.
 * @returns {string} The text with German umlauts replaced by base letters.
 */
var makeSortString = (function () {
  const umlautPattern = /[öäüÖÄÜ]/g;

  function convert(s) {
    const baseLetterFor = {
      "ä": "a",
      "ö": "o",
      "ü": "u",
      "Ä": "A",
      "Ö": "O",
      "Ü": "U"
      // probably more to come
    };
    return s.replace(umlautPattern, (umlaut) => baseLetterFor[umlaut]);
  }

  return convert;
})();
希望这对你有用。
更新:现在还早,我不知道为什么之前没有注意到这么明显的一点,不过把你的translate对象也放进闭包里可能同样有用:
/**
 * Closure-based makeSortString in which both the pattern and the lookup
 * table are created once and shared by every call.
 *
 * @param {string} s - Text to convert.
 * @returns {string} The text with German umlauts replaced by base letters.
 */
var makeSortString = (function () {
  const umlautPattern = /[öäüÖÄÜ]/g;
  const baseLetterFor = {
    "ä": "a",
    "ö": "o",
    "ü": "u",
    "Ä": "A",
    "Ö": "O",
    "Ü": "U"
    // probably more to come
  };
  const toBaseLetter = (umlaut) => baseLetterFor[umlaut];

  return (s) => s.replace(umlautPattern, toBaseLetter);
})();
这里是一个基于Unicode标准的更完整版本,取自这里: http : //semplicewebsites.com/removing-accents-javascript
var Latinise={};Latinise.latin_map={"Á":"A", "Ă":"A", "Ắ":"A", "Ặ":"A", "Ằ":"A", "Ẳ":"A", "Ẵ":"A", "Ǎ":"A", "Â":"A", "Ấ":"A", "Ậ":"A", "Ầ":"A", "Ẩ":"A", "Ẫ":"A", "Ä":"A", "Ǟ":"A", "Ȧ":"A", "Ǡ":"A", "Ạ":"A", "Ȁ":"A", "À":"A", "Ả":"A", "Ȃ":"A", "Ā":"A", "Ą":"A", "Å":"A", "Ǻ":"A", "Ḁ":"A", "Ⱥ":"A", "Ã":"A", "Ꜳ":"AA", "Æ":"AE", "Ǽ":"AE", "Ǣ":"AE", "Ꜵ":"AO", "Ꜷ":"AU", "Ꜹ":"AV", "Ꜻ":"AV", "Ꜽ":"AY", "Ḃ":"B", "Ḅ":"B", "Ɓ":"B", "Ḇ":"B", "Ƀ":"B", "Ƃ":"B", "Ć":"C", "Č":"C", "Ç":"C", "Ḉ":"C", "Ĉ":"C", "Ċ":"C", "Ƈ":"C", "Ȼ":"C", "Ď":"D", "Ḑ":"D", "Ḓ":"D", "Ḋ":"D", "Ḍ":"D", "Ɗ":"D", "Ḏ":"D", "Dz":"D", "Dž":"D", "Đ":"D", "Ƌ":"D", "DZ":"DZ", "DŽ":"DZ", "É":"E", "Ĕ":"E", "Ě":"E", "Ȩ":"E", "Ḝ":"E", "Ê":"E", "Ế":"E", "Ệ":"E", "Ề":"E", "Ể":"E", "Ễ":"E", "Ḙ":"E", "Ë":"E", "Ė":"E", "Ẹ":"E", "Ȅ":"E", "È":"E", "Ẻ":"E", "Ȇ":"E", "Ē":"E", "Ḗ":"E", "Ḕ":"E", "Ę":"E", "Ɇ":"E", "Ẽ":"E", "Ḛ":"E", "Ꝫ":"ET", "Ḟ":"F", "Ƒ":"F", "Ǵ":"G", "Ğ":"G", "Ǧ":"G", "Ģ":"G", "Ĝ":"G", "Ġ":"G", "Ɠ":"G", "Ḡ":"G", "Ǥ":"G", "Ḫ":"H", "Ȟ":"H", "Ḩ":"H", "Ĥ":"H", "Ⱨ":"H", "Ḧ":"H", "Ḣ":"H", "Ḥ":"H", "Ħ":"H", "Í":"I", "Ĭ":"I", "Ǐ":"I", "Î":"I", "Ï":"I", "Ḯ":"I", "İ":"I", "Ị":"I", "Ȉ":"I", "Ì":"I", "Ỉ":"I", "Ȋ":"I", "Ī":"I", "Į":"I", "Ɨ":"I", "Ĩ":"I", "Ḭ":"I", "Ꝺ":"D", "Ꝼ":"F", "Ᵹ":"G", "Ꞃ":"R", "Ꞅ":"S", "Ꞇ":"T", "Ꝭ":"IS", "Ĵ":"J", "Ɉ":"J", "Ḱ":"K", "Ǩ":"K", "Ķ":"K", "Ⱪ":"K", "Ꝃ":"K", "Ḳ":"K", "Ƙ":"K", "Ḵ":"K", "Ꝁ":"K", "Ꝅ":"K", "Ĺ":"L", "Ƚ":"L", "Ľ":"L", "Ļ":"L", "Ḽ":"L", "Ḷ":"L", "Ḹ":"L", "Ⱡ":"L", "Ꝉ":"L", "Ḻ":"L", "Ŀ":"L", "Ɫ":"L", "Lj":"L", "Ł":"L", "LJ":"LJ", "Ḿ":"M", "Ṁ":"M", "Ṃ":"M", "Ɱ":"M", "Ń":"N", "Ň":"N", "Ņ":"N", "Ṋ":"N", "Ṅ":"N", "Ṇ":"N", "Ǹ":"N", "Ɲ":"N", "Ṉ":"N", "Ƞ":"N", "Nj":"N", "Ñ":"N", "NJ":"NJ", "Ó":"O", "Ŏ":"O", "Ǒ":"O", "Ô":"O", "Ố":"O", "Ộ":"O", "Ồ":"O", "Ổ":"O", "Ỗ":"O", "Ö":"O", "Ȫ":"O", "Ȯ":"O", "Ȱ":"O", "Ọ":"O", "Ő":"O", "Ȍ":"O", "Ò":"O", "Ỏ":"O", "Ơ":"O", "Ớ":"O", "Ợ":"O", "Ờ":"O", "Ở":"O", "Ỡ":"O", "Ȏ":"O", "Ꝋ":"O", "Ꝍ":"O", "Ō":"O", "Ṓ":"O", "Ṑ":"O", "Ɵ":"O", "Ǫ":"O", "Ǭ":"O", "Ø":"O", 
"Ǿ":"O", "Õ":"O", "Ṍ":"O", "Ṏ":"O", "Ȭ":"O", "Ƣ":"OI", "Ꝏ":"OO", "Ɛ":"E", "Ɔ":"O", "Ȣ":"OU", "Ṕ":"P", "Ṗ":"P", "Ꝓ":"P", "Ƥ":"P", "Ꝕ":"P", "Ᵽ":"P", "Ꝑ":"P", "Ꝙ":"Q", "Ꝗ":"Q", "Ŕ":"R", "Ř":"R", "Ŗ":"R", "Ṙ":"R", "Ṛ":"R", "Ṝ":"R", "Ȑ":"R", "Ȓ":"R", "Ṟ":"R", "Ɍ":"R", "Ɽ":"R", "Ꜿ":"C", "Ǝ":"E", "Ś":"S", "Ṥ":"S", "Š":"S", "Ṧ":"S", "Ş":"S", "Ŝ":"S", "Ș":"S", "Ṡ":"S", "Ṣ":"S", "Ṩ":"S", "Ť":"T", "Ţ":"T", "Ṱ":"T", "Ț":"T", "Ⱦ":"T", "Ṫ":"T", "Ṭ":"T", "Ƭ":"T", "Ṯ":"T", "Ʈ":"T", "Ŧ":"T", "Ɐ":"A", "Ꞁ":"L", "Ɯ":"M", "Ʌ":"V", "Ꜩ":"TZ", "Ú":"U", "Ŭ":"U", "Ǔ":"U", "Û":"U", "Ṷ":"U", "Ü":"U", "Ǘ":"U", "Ǚ":"U", "Ǜ":"U", "Ǖ":"U", "Ṳ":"U", "Ụ":"U", "Ű":"U", "Ȕ":"U", "Ù":"U", "Ủ":"U", "Ư":"U", "Ứ":"U", "Ự":"U", "Ừ":"U", "Ử":"U", "Ữ":"U", "Ȗ":"U", "Ū":"U", "Ṻ":"U", "Ų":"U", "Ů":"U", "Ũ":"U", "Ṹ":"U", "Ṵ":"U", "Ꝟ":"V", "Ṿ":"V", "Ʋ":"V", "Ṽ":"V", "Ꝡ":"VY", "Ẃ":"W", "Ŵ":"W", "Ẅ":"W", "Ẇ":"W", "Ẉ":"W", "Ẁ":"W", "Ⱳ":"W", "Ẍ":"X", "Ẋ":"X", "Ý":"Y", "Ŷ":"Y", "Ÿ":"Y", "Ẏ":"Y", "Ỵ":"Y", "Ỳ":"Y", "Ƴ":"Y", "Ỷ":"Y", "Ỿ":"Y", "Ȳ":"Y", "Ɏ":"Y", "Ỹ":"Y", "Ź":"Z", "Ž":"Z", "Ẑ":"Z", "Ⱬ":"Z", "Ż":"Z", "Ẓ":"Z", "Ȥ":"Z", "Ẕ":"Z", "Ƶ":"Z", "IJ":"IJ", "Œ":"OE", "ᴀ":"A", "ᴁ":"AE", "ʙ":"B", "ᴃ":"B", "ᴄ":"C", "ᴅ":"D", "ᴇ":"E", "ꜰ":"F", "ɢ":"G", "ʛ":"G", "ʜ":"H", "ɪ":"I", "ʁ":"R", "ᴊ":"J", "ᴋ":"K", "ʟ":"L", "ᴌ":"L", "ᴍ":"M", "ɴ":"N", "ᴏ":"O", "ɶ":"OE", "ᴐ":"O", "ᴕ":"OU", "ᴘ":"P", "ʀ":"R", "ᴎ":"N", "ᴙ":"R", "ꜱ":"S", "ᴛ":"T", "ⱻ":"E", "ᴚ":"R", "ᴜ":"U", "ᴠ":"V", "ᴡ":"W", "ʏ":"Y", "ᴢ":"Z", "á":"a", "ă":"a", "ắ":"a", "ặ":"a", "ằ":"a", "ẳ":"a", "ẵ":"a", "ǎ":"a", "â":"a", "ấ":"a", "ậ":"a", "ầ":"a", "ẩ":"a", "ẫ":"a", "ä":"a", "ǟ":"a", "ȧ":"a", "ǡ":"a", "ạ":"a", "ȁ":"a", "à":"a", "ả":"a", "ȃ":"a", "ā":"a", "ą":"a", "ᶏ":"a", "ẚ":"a", "å":"a", "ǻ":"a", "ḁ":"a", "ⱥ":"a", "ã":"a", "ꜳ":"aa", "æ":"ae", "ǽ":"ae", "ǣ":"ae", "ꜵ":"ao", "ꜷ":"au", "ꜹ":"av", "ꜻ":"av", "ꜽ":"ay", "ḃ":"b", "ḅ":"b", "ɓ":"b", "ḇ":"b", "ᵬ":"b", "ᶀ":"b", "ƀ":"b", "ƃ":"b", "ɵ":"o", "ć":"c", "č":"c", "ç":"c", "ḉ":"c", "ĉ":"c", "ɕ":"c", "ċ":"c", "ƈ":"c", "ȼ":"c", 
"ď":"d", "ḑ":"d", "ḓ":"d", "ȡ":"d", "ḋ":"d", "ḍ":"d", "ɗ":"d", "ᶑ":"d", "ḏ":"d", "ᵭ":"d", "ᶁ":"d", "đ":"d", "ɖ":"d", "ƌ":"d", "ı":"i", "ȷ":"j", "ɟ":"j", "ʄ":"j", "dz":"dz", "dž":"dz", "é":"e", "ĕ":"e", "ě":"e", "ȩ":"e", "ḝ":"e", "ê":"e", "ế":"e", "ệ":"e", "ề":"e", "ể":"e", "ễ":"e", "ḙ":"e", "ë":"e", "ė":"e", "ẹ":"e", "ȅ":"e", "è":"e", "ẻ":"e", "ȇ":"e", "ē":"e", "ḗ":"e", "ḕ":"e", "ⱸ":"e", "ę":"e", "ᶒ":"e", "ɇ":"e", "ẽ":"e", "ḛ":"e", "ꝫ":"et", "ḟ":"f", "ƒ":"f", "ᵮ":"f", "ᶂ":"f", "ǵ":"g", "ğ":"g", "ǧ":"g", "ģ":"g", "ĝ":"g", "ġ":"g", "ɠ":"g", "ḡ":"g", "ᶃ":"g", "ǥ":"g", "ḫ":"h", "ȟ":"h", "ḩ":"h", "ĥ":"h", "ⱨ":"h", "ḧ":"h", "ḣ":"h", "ḥ":"h", "ɦ":"h", "ẖ":"h", "ħ":"h", "ƕ":"hv", "í":"i", "ĭ":"i", "ǐ":"i", "î":"i", "ï":"i", "ḯ":"i", "ị":"i", "ȉ":"i", "ì":"i", "ỉ":"i", "ȋ":"i", "ī":"i", "į":"i", "ᶖ":"i", "ɨ":"i", "ĩ":"i", "ḭ":"i", "ꝺ":"d", "ꝼ":"f", "ᵹ":"g", "ꞃ":"r", "ꞅ":"s", "ꞇ":"t", "ꝭ":"is", "ǰ":"j", "ĵ":"j", "ʝ":"j", "ɉ":"j", "ḱ":"k", "ǩ":"k", "ķ":"k", "ⱪ":"k", "ꝃ":"k", "ḳ":"k", "ƙ":"k", "ḵ":"k", "ᶄ":"k", "ꝁ":"k", "ꝅ":"k", "ĺ":"l", "ƚ":"l", "ɬ":"l", "ľ":"l", "ļ":"l", "ḽ":"l", "ȴ":"l", "ḷ":"l", "ḹ":"l", "ⱡ":"l", "ꝉ":"l", "ḻ":"l", "ŀ":"l", "ɫ":"l", "ᶅ":"l", "ɭ":"l", "ł":"l", "lj":"lj", "ſ":"s", "ẜ":"s", "ẛ":"s", "ẝ":"s", "ḿ":"m", "ṁ":"m", "ṃ":"m", "ɱ":"m", "ᵯ":"m", "ᶆ":"m", "ń":"n", "ň":"n", "ņ":"n", "ṋ":"n", "ȵ":"n", "ṅ":"n", "ṇ":"n", "ǹ":"n", "ɲ":"n", "ṉ":"n", "ƞ":"n", "ᵰ":"n", "ᶇ":"n", "ɳ":"n", "ñ":"n", "nj":"nj", "ó":"o", "ŏ":"o", "ǒ":"o", "ô":"o", "ố":"o", "ộ":"o", "ồ":"o", "ổ":"o", "ỗ":"o", "ö":"o", "ȫ":"o", "ȯ":"o", "ȱ":"o", "ọ":"o", "ő":"o", "ȍ":"o", "ò":"o", "ỏ":"o", "ơ":"o", "ớ":"o", "ợ":"o", "ờ":"o", "ở":"o", "ỡ":"o", "ȏ":"o", "ꝋ":"o", "ꝍ":"o", "ⱺ":"o", "ō":"o", "ṓ":"o", "ṑ":"o", "ǫ":"o", "ǭ":"o", "ø":"o", "ǿ":"o", "õ":"o", "ṍ":"o", "ṏ":"o", "ȭ":"o", "ƣ":"oi", "ꝏ":"oo", "ɛ":"e", "ᶓ":"e", "ɔ":"o", "ᶗ":"o", "ȣ":"ou", "ṕ":"p", "ṗ":"p", "ꝓ":"p", "ƥ":"p", "ᵱ":"p", "ᶈ":"p", "ꝕ":"p", "ᵽ":"p", "ꝑ":"p", "ꝙ":"q", "ʠ":"q", "ɋ":"q", "ꝗ":"q", "ŕ":"r", "ř":"r", "ŗ":"r", 
"ṙ":"r", "ṛ":"r", "ṝ":"r", "ȑ":"r", "ɾ":"r", "ᵳ":"r", "ȓ":"r", "ṟ":"r", "ɼ":"r", "ᵲ":"r", "ᶉ":"r", "ɍ":"r", "ɽ":"r", "ↄ":"c", "ꜿ":"c", "ɘ":"e", "ɿ":"r", "ś":"s", "ṥ":"s", "š":"s", "ṧ":"s", "ş":"s", "ŝ":"s", "ș":"s", "ṡ":"s", "ṣ":"s", "ṩ":"s", "ʂ":"s", "ᵴ":"s", "ᶊ":"s", "ȿ":"s", "ɡ":"g", "ᴑ":"o", "ᴓ":"o", "ᴝ":"u", "ť":"t", "ţ":"t", "ṱ":"t", "ț":"t", "ȶ":"t", "ẗ":"t", "ⱦ":"t", "ṫ":"t", "ṭ":"t", "ƭ":"t", "ṯ":"t", "ᵵ":"t", "ƫ":"t", "ʈ":"t", "ŧ":"t", "ᵺ":"th", "ɐ":"a", "ᴂ":"ae", "ǝ":"e", "ᵷ":"g", "ɥ":"h", "ʮ":"h", "ʯ":"h", "ᴉ":"i", "ʞ":"k", "ꞁ":"l", "ɯ":"m", "ɰ":"m", "ᴔ":"oe", "ɹ":"r", "ɻ":"r", "ɺ":"r", "ⱹ":"r", "ʇ":"t", "ʌ":"v", "ʍ":"w", "ʎ":"y", "ꜩ":"tz", "ú":"u", "ŭ":"u", "ǔ":"u", "û":"u", "ṷ":"u", "ü":"u", "ǘ":"u", "ǚ":"u", "ǜ":"u", "ǖ":"u", "ṳ":"u", "ụ":"u", "ű":"u", "ȕ":"u", "ù":"u", "ủ":"u", "ư":"u", "ứ":"u", "ự":"u", "ừ":"u", "ử":"u", "ữ":"u", "ȗ":"u", "ū":"u", "ṻ":"u", "ų":"u", "ᶙ":"u", "ů":"u", "ũ":"u", "ṹ":"u", "ṵ":"u", "ᵫ":"ue", "ꝸ":"um", "ⱴ":"v", "ꝟ":"v", "ṿ":"v", "ʋ":"v", "ᶌ":"v", "ⱱ":"v", "ṽ":"v", "ꝡ":"vy", "ẃ":"w", "ŵ":"w", "ẅ":"w", "ẇ":"w", "ẉ":"w", "ẁ":"w", "ⱳ":"w", "ẘ":"w", "ẍ":"x", "ẋ":"x", "ᶍ":"x", "ý":"y", "ŷ":"y", "ÿ":"y", "ẏ":"y", "ỵ":"y", "ỳ":"y", "ƴ":"y", "ỷ":"y", "ỿ":"y", "ȳ":"y", "ẙ":"y", "ɏ":"y", "ỹ":"y", "ź":"z", "ž":"z", "ẑ":"z", "ʑ":"z", "ⱬ":"z", "ż":"z", "ẓ":"z", "ȥ":"z", "ẕ":"z", "ᵶ":"z", "ᶎ":"z", "ʐ":"z", "ƶ":"z", "ɀ":"z", "ff":"ff", "ffi":"ffi", "ffl":"ffl", "fi":"fi", "fl":"fl", "ij":"ij", "œ":"oe", "st":"st", "ₐ":"a", "ₑ":"e", "ᵢ":"i", "ⱼ":"j", "ₒ":"o", "ᵣ":"r", "ᵤ":"u", "ᵥ":"v", "ₓ":"x"}; String.prototype.latinise=function(){return this.replace(/[^A-Za-z0-9\[\] ]/g,function(a){return Latinise.latin_map[a]||a})}; String.prototype.latinize=String.prototype.latinise; String.prototype.isLatin=function(){return this==this.latinise()}
一些例子:
> "Piqué".latinize(); "Pique" > "Piqué".isLatin(); false > "Pique".isLatin(); true > "Piqué".latinise().isLatin(); true
这类重音符号的正确术语是变音符号(Diacritics)。 用谷歌搜索这个词之后,我发现了下面这个函数,它是backbone.paginator的一部分。 它包含一个非常完整的变音符号集合,并用最直观的ASCII字符替换它们。 我认为这是目前最完整的JavaScript解决方案。
全功能供将来参考:
function removeDiacritics (str) { var defaultDiacriticsRemovalMap = [ {'base':'A', 'letters':/[\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F]/g}, {'base':'AA','letters':/[\uA732]/g}, {'base':'AE','letters':/[\u00C6\u01FC\u01E2]/g}, {'base':'AO','letters':/[\uA734]/g}, {'base':'AU','letters':/[\uA736]/g}, {'base':'AV','letters':/[\uA738\uA73A]/g}, {'base':'AY','letters':/[\uA73C]/g}, {'base':'B', 'letters':/[\u0042\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0182\u0181]/g}, {'base':'C', 'letters':/[\u0043\u24B8\uFF23\u0106\u0108\u010A\u010C\u00C7\u1E08\u0187\u023B\uA73E]/g}, {'base':'D', 'letters':/[\u0044\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018B\u018A\u0189\uA779]/g}, {'base':'DZ','letters':/[\u01F1\u01C4]/g}, {'base':'Dz','letters':/[\u01F2\u01C5]/g}, {'base':'E', 'letters':/[\u0045\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E]/g}, {'base':'F', 'letters':/[\u0046\u24BB\uFF26\u1E1E\u0191\uA77B]/g}, {'base':'G', 'letters':/[\u0047\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E]/g}, {'base':'H', 'letters':/[\u0048\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D]/g}, {'base':'I', 'letters':/[\u0049\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197]/g}, {'base':'J', 'letters':/[\u004A\u24BF\uFF2A\u0134\u0248]/g}, {'base':'K', 'letters':/[\u004B\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2]/g}, {'base':'L', 'letters':/[\u004C\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780]/g}, {'base':'LJ','letters':/[\u01C7]/g}, {'base':'Lj','letters':/[\u01C8]/g}, 
{'base':'M', 'letters':/[\u004D\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C]/g}, {'base':'N', 'letters':/[\u004E\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u0220\u019D\uA790\uA7A4]/g}, {'base':'NJ','letters':/[\u01CA]/g}, {'base':'Nj','letters':/[\u01CB]/g}, {'base':'O', 'letters':/[\u004F\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C]/g}, {'base':'OI','letters':/[\u01A2]/g}, {'base':'OO','letters':/[\uA74E]/g}, {'base':'OU','letters':/[\u0222]/g}, {'base':'P', 'letters':/[\u0050\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754]/g}, {'base':'Q', 'letters':/[\u0051\u24C6\uFF31\uA756\uA758\u024A]/g}, {'base':'R', 'letters':/[\u0052\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782]/g}, {'base':'S', 'letters':/[\u0053\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784]/g}, {'base':'T', 'letters':/[\u0054\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786]/g}, {'base':'TZ','letters':/[\uA728]/g}, {'base':'U', 'letters':/[\u0055\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244]/g}, {'base':'V', 'letters':/[\u0056\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245]/g}, {'base':'VY','letters':/[\uA760]/g}, {'base':'W', 'letters':/[\u0057\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72]/g}, {'base':'X', 'letters':/[\u0058\u24CD\uFF38\u1E8A\u1E8C]/g}, {'base':'Y', 'letters':/[\u0059\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE]/g}, {'base':'Z', 
'letters':/[\u005A\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762]/g}, {'base':'a', 'letters':/[\u0061\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250]/g}, {'base':'aa','letters':/[\uA733]/g}, {'base':'ae','letters':/[\u00E6\u01FD\u01E3]/g}, {'base':'ao','letters':/[\uA735]/g}, {'base':'au','letters':/[\uA737]/g}, {'base':'av','letters':/[\uA739\uA73B]/g}, {'base':'ay','letters':/[\uA73D]/g}, {'base':'b', 'letters':/[\u0062\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253]/g}, {'base':'c', 'letters':/[\u0063\u24D2\uFF43\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184]/g}, {'base':'d', 'letters':/[\u0064\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\uA77A]/g}, {'base':'dz','letters':/[\u01F3\u01C6]/g}, {'base':'e', 'letters':/[\u0065\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u025B\u01DD]/g}, {'base':'f', 'letters':/[\u0066\u24D5\uFF46\u1E1F\u0192\uA77C]/g}, {'base':'g', 'letters':/[\u0067\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\u1D79\uA77F]/g}, {'base':'h', 'letters':/[\u0068\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265]/g}, {'base':'hv','letters':/[\u0195]/g}, {'base':'i', 'letters':/[\u0069\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131]/g}, {'base':'j', 'letters':/[\u006A\u24D9\uFF4A\u0135\u01F0\u0249]/g}, {'base':'k', 'letters':/[\u006B\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3]/g}, {'base':'l', 'letters':/[\u006C\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747]/g}, 
{'base':'lj','letters':/[\u01C9]/g}, {'base':'m', 'letters':/[\u006D\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F]/g}, {'base':'n', 'letters':/[\u006E\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5]/g}, {'base':'nj','letters':/[\u01CC]/g}, {'base':'o', 'letters':/[\u006F\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\u0254\uA74B\uA74D\u0275]/g}, {'base':'oi','letters':/[\u01A3]/g}, {'base':'ou','letters':/[\u0223]/g}, {'base':'oo','letters':/[\uA74F]/g}, {'base':'p','letters':/[\u0070\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755]/g}, {'base':'q','letters':/[\u0071\u24E0\uFF51\u024B\uA757\uA759]/g}, {'base':'r','letters':/[\u0072\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783]/g}, {'base':'s','letters':/[\u0073\u24E2\uFF53\u00DF\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B]/g}, {'base':'t','letters':/[\u0074\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787]/g}, {'base':'tz','letters':/[\uA729]/g}, {'base':'u','letters':/[\u0075\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289]/g}, {'base':'v','letters':/[\u0076\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C]/g}, {'base':'vy','letters':/[\uA761]/g}, {'base':'w','letters':/[\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73]/g}, {'base':'x','letters':/[\u0078\u24E7\uFF58\u1E8B\u1E8D]/g}, {'base':'y','letters':/[\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF]/g}, 
{'base':'z','letters':/[\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763]/g} ]; for(var i=0; i<defaultDiacriticsRemovalMap.length; i++) { str = str.replace(defaultDiacriticsRemovalMap[i].letters, defaultDiacriticsRemovalMap[i].base); } return str; }
基于Jason Bunting的解决方案,这里是我现在使用的。
整个事情是为jQuery tablesorter插件 :为了(几乎正确)使用tablesorter插件排序非英语表,有必要使用自定义的textExtraction
函数 。
这个:
- 将最常见的重音字母转换为不带重音的字母(支持的字母列表很容易扩展)
- 将德文格式的日期(
'dd.mm.yyyy'
)更改为可识别的格式('yyyy-mm-dd'
)
注意以UTF-8编码保存JavaScript文件,否则将无法正常工作。
// file encoding must be UTF-8!

/**
 * Creates a text-extraction callback for the jQuery tablesorter plugin.
 *
 * The returned function reads the trimmed text of a table cell and then:
 *   - if the text is a German-style date ('d.m.yyyy', optionally preceded by
 *     non-digit text such as a weekday), returns it as 'yyyy-mm-dd' with day
 *     and month zero-padded to two digits so that the keys also order
 *     correctly when compared as plain text;
 *   - otherwise returns the text with the most common accented letters
 *     replaced by unaccented ones.
 *
 * Requires jQuery (`$`) to be available when the returned function is called.
 *
 * @returns {function(*): string} Extractor taking a cell node.
 */
function getTextExtractor() {
  const patternLetters = /[öäüÖÄÜáàâéèêúùûóòôÁÀÂÉÈÊÚÙÛÓÒÔß]/g;
  const patternDateDmy = /^(?:\D+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})$/;
  const lookupLetters = {
    "ä": "a", "ö": "o", "ü": "u",
    "Ä": "A", "Ö": "O", "Ü": "U",
    "á": "a", "à": "a", "â": "a",
    "é": "e", "è": "e", "ê": "e",
    "ú": "u", "ù": "u", "û": "u",
    "ó": "o", "ò": "o", "ô": "o",
    "Á": "A", "À": "A", "Â": "A",
    "É": "E", "È": "E", "Ê": "E",
    "Ú": "U", "Ù": "U", "Û": "U",
    "Ó": "O", "Ò": "O", "Ô": "O",
    "ß": "s"
  };

  // Falls back to the matched character if the table has no entry for it.
  function letterTranslator(match) {
    return lookupLetters[match] || match;
  }

  // The date pattern accepts one- or two-digit day/month; pad to two digits.
  function padTwoDigits(digits) {
    return digits.length < 2 ? "0" + digits : digits;
  }

  return function (node) {
    const text = $.trim($(node).text());
    const date = text.match(patternDateDmy);
    if (date) {
      return [date[3], padTwoDigits(date[2]), padTwoDigits(date[1])].join("-");
    }
    return text.replace(patternLetters, letterTranslator);
  };
}
你可以像这样使用它:
$("table.sortable").tablesorter({ textExtraction: getTextExtractor() });
我认为这可能会更清洁/更好(虽然我没有测试它的性能):
/**
 * Replaces common accented Latin letters in this string with their plain
 * counterparts.
 *
 * The replacement for a character is found by position: the accented
 * characters appear in the pattern in the same order as their replacements
 * appear in the `plain` string.
 *
 * @returns {string} A copy of this string without the listed accents.
 */
String.prototype.stripAccents = function () {
  const translate_re = /[àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ]/g;
  const plain = 'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUY';
  const patternSource = translate_re.source;

  return this.replace(translate_re, (accented) => {
    // The pattern source starts with '[', hence the offset of one.
    const position = patternSource.indexOf(accented) - 1;
    return plain.charAt(position);
  });
};
或者,如果你仍然担心性能,那么我们可以两全其美:
/**
 * Replaces common accented Latin letters in this string with their plain
 * counterparts, using a character-to-character lookup table.
 *
 * The table and the pattern are built once, inside a closure, instead of on
 * every call. The pattern is created with the RegExp constructor rather than
 * `eval`: a `var` declared inside `eval` is not visible to the surrounding
 * function in strict mode (and therefore in ES modules), which made the
 * earlier version throw a ReferenceError there.
 *
 * @returns {string} A copy of this string without the listed accents.
 */
String.prototype.stripAccents = (function () {
  const in_chrs = 'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ';
  const out_chrs = 'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUY';

  // in_chrs and out_chrs are parallel: position i maps to position i.
  const transl = {};
  for (let i = 0; i < in_chrs.length; i++) {
    transl[in_chrs.charAt(i)] = out_chrs.charAt(i);
  }

  const chars_rgx = new RegExp('[' + in_chrs + ']', 'g');

  return function () {
    return this.replace(chars_rgx, (match) => transl[match]);
  };
})();
编辑 (@Tomalak)
我很欣赏这个想法。 但是,如下面的评论所述,实施有几个问题。
这是我将如何实施它。
/**
 * Returns a copy of a string with common accented Latin letters replaced by
 * plain ones. The lookup table and pattern are created once and kept private
 * in a closure.
 *
 * @param {string} s - Text to convert.
 * @returns {string} The text without the listed accents.
 */
var stripAccents = (function () {
  const accented = 'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ';
  const plain = 'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUY';
  const accentPattern = new RegExp('[' + accented + ']', 'g');

  // The two strings are parallel: position i maps to position i.
  const plainFor = {};
  accented.split('').forEach((letter, position) => {
    plainFor[letter] = plain[position];
  });

  // Unknown characters are returned unchanged.
  const toPlain = (letter) => plainFor[letter] || letter;

  return (s) => s.replace(accentPattern, toPlain);
})();
Simply normalize the string and then strip the combining diacritical marks:
// Demo: NFKD normalization splits each accented letter into its base letter
// followed by a combining mark; the replace() then deletes every character
// in the range U+0300–U+036F.
var str = "Letras Á É Í Ó Ú Ñ - á é í ó ú ñ...";
console.log (str.normalize ("NFKD").replace (/[\u0300-\u036F]/g, ""));
// Letras A E I O U N - a e i o u n...
See normalize
Then you can use this function:
/**
 * Removes combining diacritical marks (U+0300–U+036F) from a string.
 *
 * When `normalize` is available on the value, the string is first
 * NFKD-normalized so that precomposed accented letters are split into a
 * base letter plus combining marks; otherwise only marks that are already
 * separate characters are removed.
 *
 * @param {string} s - Text to convert.
 * @returns {string} The text without combining marks.
 */
function noTilde(s) {
  const combiningMarks = /[\u0300-\u036F]/g;
  const decomposed = s.normalize == null ? s : s.normalize("NFKD");
  return decomposed.replace(combiningMarks, "");
}
The complete solution to your request is:
/**
 * Replacement rules for convert_accented_characters, compiled once.
 *
 * Each entry of the literal maps a replacement string to a '|'-separated
 * list of characters it replaces. Rules are applied in the order written,
 * so the German digraph rules (ä -> ae, Ä -> Ae, ...) take effect before
 * the single-letter rules are reached.
 *
 * Characters that compatibility normalization would fold into plain ASCII
 * sequences (Ĳ, ĳ, ŉ, Ŀ, ŀ) are written as \u escapes so that the rules
 * survive such normalization of this file.
 */
const ACCENTED_CHARACTER_RULES = Object.entries({
  'ae': 'ä|æ|ǽ',
  'oe': 'ö|œ',
  'ue': 'ü',
  'Ae': 'Ä',
  'Ue': 'Ü',
  'Oe': 'Ö',
  'A': 'À|Á|Â|Ã|Ä|Å|Ǻ|Ā|Ă|Ą|Ǎ',
  'a': 'à|á|â|ã|å|ǻ|ā|ă|ą|ǎ|ª',
  'C': 'Ç|Ć|Ĉ|Ċ|Č',
  'c': 'ç|ć|ĉ|ċ|č',
  'D': 'Ð|Ď|Đ',
  'd': 'ð|ď|đ',
  'E': 'È|É|Ê|Ë|Ē|Ĕ|Ė|Ę|Ě',
  'e': 'è|é|ê|ë|ē|ĕ|ė|ę|ě',
  'G': 'Ĝ|Ğ|Ġ|Ģ',
  'g': 'ĝ|ğ|ġ|ģ',
  'H': 'Ĥ|Ħ',
  'h': 'ĥ|ħ',
  'I': 'Ì|Í|Î|Ï|Ĩ|Ī|Ĭ|Ǐ|Į|İ',
  'i': 'ì|í|î|ï|ĩ|ī|ĭ|ǐ|į|ı',
  'J': 'Ĵ',
  'j': 'ĵ',
  'K': 'Ķ',
  'k': 'ķ',
  'L': 'Ĺ|Ļ|Ľ|\u013F|Ł',
  'l': 'ĺ|ļ|ľ|\u0140|ł',
  'N': 'Ñ|Ń|Ņ|Ň',
  'n': 'ñ|ń|ņ|ň|\u0149',
  'O': 'Ò|Ó|Ô|Õ|Ō|Ŏ|Ǒ|Ő|Ơ|Ø|Ǿ',
  'o': 'ò|ó|ô|õ|ō|ŏ|ǒ|ő|ơ|ø|ǿ|º',
  'R': 'Ŕ|Ŗ|Ř',
  'r': 'ŕ|ŗ|ř',
  'S': 'Ś|Ŝ|Ş|Š',
  's': 'ś|ŝ|ş|š|ſ',
  'T': 'Ţ|Ť|Ŧ',
  't': 'ţ|ť|ŧ',
  'U': 'Ù|Ú|Û|Ũ|Ū|Ŭ|Ů|Ű|Ų|Ư|Ǔ|Ǖ|Ǘ|Ǚ|Ǜ',
  'u': 'ù|ú|û|ũ|ū|ŭ|ů|ű|ų|ư|ǔ|ǖ|ǘ|ǚ|ǜ',
  'Y': 'Ý|Ÿ|Ŷ',
  'y': 'ý|ÿ|ŷ',
  'W': 'Ŵ',
  'w': 'ŵ',
  'Z': 'Ź|Ż|Ž',
  'z': 'ź|ż|ž',
  'AE': 'Æ|Ǽ',
  'ss': 'ß',
  'IJ': '\u0132',
  'ij': '\u0133',
  'OE': 'Œ',
  'f': 'ƒ',
}).map(([replacement, alternatives]) => [new RegExp(alternatives, 'g'), replacement]);

/**
 * Transliterates accented and special Latin characters to ASCII, using
 * German-style digraphs for umlauts (ä -> ae, ö -> oe, ü -> ue, ß -> ss).
 *
 * @param {string} str - Text to convert.
 * @returns {string} The transliterated text.
 */
function convert_accented_characters(str) {
  let result = str;
  for (const [pattern, replacement] of ACCENTED_CHARACTER_RULES) {
    result = result.replace(pattern, replacement);
  }
  return result;
}
I made a Prototype Version of this:
/**
 * Returns a copy of this string in which German umlauts are replaced by
 * their base letters, 'ß' by 'ss', and every space by an underscore.
 *
 * @returns {string} The converted string.
 */
String.prototype.strip = function () {
  const replacementFor = {
    "ä": "a",
    "ö": "o",
    "ü": "u",
    "Ä": "A",
    "Ö": "O",
    "Ü": "U",
    " ": "_",
    "ß": "ss"
    // probably more to come
  };
  const replaceable = /[öäüÖÄÜß ]/g;

  return this.replace(replaceable, (found) => replacementFor[found]);
};
Use like:
var teststring = 'ä ö ü Ä Ö Ü ß'; teststring.strip();
This will change the string to a_o_u_A_O_U_ss
If you're looking specifically for a way to convert accented characters to non-accented characters, rather than a way to sort accented characters, with a little finagling, the String.localeCompare function can be manipulated to find the basic latin characters that match the extended ones. For example, you might want to produce a human friendly url slug from a page title. If so, you can do something like this:
// Candidate replacements: the letters 'a'..'z' followed by a few compound
// sequences for characters that correspond to more than one base letter.
var baseChars = Array.from({ length: 26 }, function (_, offset) {
  return String.fromCharCode(97 + offset);
});
//if needed, handle fancy compound characters
baseChars = baseChars.concat('ss,aa,ae,ao,au,av,ay,dz,hv,lj,nj,oi,ou,oo,tz,vy'.split(','));

/**
 * Reports whether lower-casing the character changes it.
 *
 * @param {string} c - Character to test.
 * @returns {boolean} True when `c` differs from its locale lower-case form.
 */
function isUpperCase(c) {
  return c.toLocaleLowerCase() !== c;
}

/**
 * Finds the entry of `baseChars` that compares equal to `c` when
 * localeCompare is run with `sensitivity: 'base'`.
 *
 * Options (all optional):
 *   - locale: locale passed to localeCompare; set to 'en' on the options
 *     object when absent.
 *   - nonAlphaChar: returned for characters that compare at or below '9';
 *     when undefined the character itself is returned.
 *   - noMatchChar: returned when no candidate matches; when undefined the
 *     character itself is returned.
 *
 * @param {string} c - Character to convert.
 * @param {Object} [opts] - Options described above.
 * @returns {string} The matching base sequence (upper-cased when `c` is
 *   upper case), or the applicable fallback.
 */
function toBaseChar(c, opts) {
  const settings = opts || {};
  if (!('locale' in settings)) {
    settings.locale = 'en';
  }
  const collation = {sensitivity: 'base'};

  // Anything that sorts at or before '9' is treated as non-alphabetical.
  const isNonAlpha = c.localeCompare('9', settings.locale, collation) <= 0;
  if (isNonAlpha) {
    return settings.nonAlphaChar === undefined ? c : settings.nonAlphaChar;
  }

  for (const candidate of baseChars) {
    if (c.localeCompare(candidate, settings.locale, collation) !== 0) {
      continue;
    }
    return isUpperCase(c) ? candidate.toUpperCase() : candidate;
  }

  return settings.noMatchChar === undefined ? c : settings.noMatchChar;
}

/**
 * Runs every character of `str` that is not matched by \w, \s or \d through
 * toBaseChar and returns the resulting string.
 *
 * @param {string} str - Text to convert.
 * @param {Object} [opts] - Options forwarded to toBaseChar.
 * @returns {string} The converted text.
 */
function latinify(str, opts) {
  const convert = function (c) {
    return toBaseChar(c, opts);
  };
  return str.replace(/[^\w\s\d]/g, convert);
}

// Example:
console.log(latinify('Čeština Tsėhesenėstsestotse Tshivenḓa Emigliàn–Rumagnòl Slovenščina Português Tiếng Việt Straße'))
// "Cestina Tsehesenestsestotse Tshivenda Emiglian–Rumagnol Slovenscina Portugues Tieng Viet Strasse"
A direct port to javascript of Kierons solution: https://github.com/rwarasaurus/nano/blob/master/system/helpers.php#L61-73 :
/**
 * Normalise a string, replacing accented and special Latin characters with
 * plain ASCII letters.
 *
 * Every occurrence of a listed character is replaced. (Calling
 * String.prototype.replace with a string pattern, as an earlier version did,
 * only replaces the first occurrence.)
 *
 * @param {String} str
 * @return {String} str
 */
var normalize = (function () {
  // `a` (characters to replace) and `b` (replacements) are parallel arrays.
  // Ĳ, ĳ, Ŀ, ŀ and ŉ are written as \u escapes because compatibility
  // normalization of this file would turn them into ASCII sequences.
  var a = [
    'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
    'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'ß',
    'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
    'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ÿ',
    'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą',
    'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č',
    'Ď', 'ď', 'Đ', 'đ',
    'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'ě',
    'Ĝ', 'ĝ', 'Ğ', 'ğ', 'Ġ', 'ġ', 'Ģ', 'ģ',
    'Ĥ', 'ĥ', 'Ħ', 'ħ',
    'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı',
    '\u0132', '\u0133',
    'Ĵ', 'ĵ',
    'Ķ', 'ķ',
    'Ĺ', 'ĺ', 'Ļ', 'ļ', 'Ľ', 'ľ', '\u013F', '\u0140', 'Ł', 'ł',
    'Ń', 'ń', 'Ņ', 'ņ', 'Ň', 'ň', '\u0149',
    'Ō', 'ō', 'Ŏ', 'ŏ', 'Ő', 'ő',
    'Œ', 'œ',
    'Ŕ', 'ŕ', 'Ŗ', 'ŗ', 'Ř', 'ř',
    'Ś', 'ś', 'Ŝ', 'ŝ', 'Ş', 'ş', 'Š', 'š',
    'Ţ', 'ţ', 'Ť', 'ť', 'Ŧ', 'ŧ',
    'Ũ', 'ũ', 'Ū', 'ū', 'Ŭ', 'ŭ', 'Ů', 'ů', 'Ű', 'ű', 'Ų', 'ų',
    'Ŵ', 'ŵ',
    'Ŷ', 'ŷ', 'Ÿ',
    'Ź', 'ź', 'Ż', 'ż', 'Ž', 'ž',
    'ſ', 'ƒ',
    'Ơ', 'ơ', 'Ư', 'ư',
    'Ǎ', 'ǎ', 'Ǐ', 'ǐ', 'Ǒ', 'ǒ', 'Ǔ', 'ǔ',
    'Ǖ', 'ǖ', 'Ǘ', 'ǘ', 'Ǚ', 'ǚ', 'Ǜ', 'ǜ',
    'Ǻ', 'ǻ', 'Ǽ', 'ǽ', 'Ǿ', 'ǿ'
  ];
  var b = [
    'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
    'D', 'N', 'O', 'O', 'O', 'O', 'O', 'O', 'U', 'U', 'U', 'U', 'Y', 's',
    'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
    'n', 'o', 'o', 'o', 'o', 'o', 'o', 'u', 'u', 'u', 'u', 'y', 'y',
    'A', 'a', 'A', 'a', 'A', 'a',
    'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c',
    'D', 'd', 'D', 'd',
    'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e',
    'G', 'g', 'G', 'g', 'G', 'g', 'G', 'g',
    'H', 'h', 'H', 'h',
    'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i',
    'IJ', 'ij',
    'J', 'j',
    'K', 'k',
    // 'Ł' maps to upper-case 'L' (it was lower-case 'l' before).
    'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l',
    'N', 'n', 'N', 'n', 'N', 'n', 'n',
    'O', 'o', 'O', 'o', 'O', 'o',
    'OE', 'oe',
    'R', 'r', 'R', 'r', 'R', 'r',
    'S', 's', 'S', 's', 'S', 's', 'S', 's',
    'T', 't', 'T', 't', 'T', 't',
    'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',
    'W', 'w',
    'Y', 'y', 'Y',
    'Z', 'z', 'Z', 'z', 'Z', 'z',
    's', 'f',
    'O', 'o', 'U', 'u',
    'A', 'a', 'I', 'i', 'O', 'o', 'U', 'u',
    'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',
    'A', 'a', 'AE', 'ae', 'O', 'o'
  ];

  // Build the lookup and a single global pattern once. Every entry of `a`
  // is one character with no special meaning inside a character class.
  var replacementFor = new Map(a.map(function (ch, index) {
    return [ch, b[index]];
  }));
  var pattern = new RegExp('[' + a.join('') + ']', 'g');

  return function (str) {
    return str.replace(pattern, function (ch) {
      return replacementFor.get(ch);
    });
  };
}());
And a slightly modified version, using a char-map instead of two arrays:
To compare these two methods I made a simple benchmark: http://jsperf.com/replace-foreign-characters
/**
 * Normalise a string, replacing accented and special Latin characters with
 * plain ASCII letters by looking each non-word character up in a map.
 * Characters without a map entry are left unchanged.
 *
 * @param {String} str
 * @return {String}
 */
var normalize = (function () {
  // Ĳ, ĳ, Ŀ, ŀ and ŉ are written as \u escapes because compatibility
  // normalization of this file would turn them into multi-character ASCII
  // sequences, which the single-character pattern below can never match.
  var map = {
      "À": "A", "Á": "A", "Â": "A", "Ã": "A", "Ä": "A", "Å": "A", "Æ": "AE", "Ç": "C",
      "È": "E", "É": "E", "Ê": "E", "Ë": "E", "Ì": "I", "Í": "I", "Î": "I", "Ï": "I",
      "Ð": "D", "Ñ": "N", "Ò": "O", "Ó": "O", "Ô": "O", "Õ": "O", "Ö": "O", "Ø": "O",
      "Ù": "U", "Ú": "U", "Û": "U", "Ü": "U", "Ý": "Y", "ß": "s",
      "à": "a", "á": "a", "â": "a", "ã": "a", "ä": "a", "å": "a", "æ": "ae", "ç": "c",
      "è": "e", "é": "e", "ê": "e", "ë": "e", "ì": "i", "í": "i", "î": "i", "ï": "i",
      "ñ": "n", "ò": "o", "ó": "o", "ô": "o", "õ": "o", "ö": "o", "ø": "o",
      "ù": "u", "ú": "u", "û": "u", "ü": "u", "ý": "y", "ÿ": "y",
      "Ā": "A", "ā": "a", "Ă": "A", "ă": "a", "Ą": "A", "ą": "a",
      "Ć": "C", "ć": "c", "Ĉ": "C", "ĉ": "c", "Ċ": "C", "ċ": "c", "Č": "C", "č": "c",
      "Ď": "D", "ď": "d", "Đ": "D", "đ": "d",
      "Ē": "E", "ē": "e", "Ĕ": "E", "ĕ": "e", "Ė": "E", "ė": "e", "Ę": "E", "ę": "e", "Ě": "E", "ě": "e",
      "Ĝ": "G", "ĝ": "g", "Ğ": "G", "ğ": "g", "Ġ": "G", "ġ": "g", "Ģ": "G", "ģ": "g",
      "Ĥ": "H", "ĥ": "h", "Ħ": "H", "ħ": "h",
      "Ĩ": "I", "ĩ": "i", "Ī": "I", "ī": "i", "Ĭ": "I", "ĭ": "i", "Į": "I", "į": "i", "İ": "I", "ı": "i",
      "\u0132": "IJ", "\u0133": "ij",
      "Ĵ": "J", "ĵ": "j",
      "Ķ": "K", "ķ": "k",
      "Ĺ": "L", "ĺ": "l", "Ļ": "L", "ļ": "l", "Ľ": "L", "ľ": "l", "\u013F": "L", "\u0140": "l",
      // 'Ł' maps to upper-case 'L' (it was lower-case 'l' before).
      "Ł": "L", "ł": "l",
      "Ń": "N", "ń": "n", "Ņ": "N", "ņ": "n", "Ň": "N", "ň": "n", "\u0149": "n",
      "Ō": "O", "ō": "o", "Ŏ": "O", "ŏ": "o", "Ő": "O", "ő": "o",
      "Œ": "OE", "œ": "oe",
      "Ŕ": "R", "ŕ": "r", "Ŗ": "R", "ŗ": "r", "Ř": "R", "ř": "r",
      "Ś": "S", "ś": "s", "Ŝ": "S", "ŝ": "s", "Ş": "S", "ş": "s", "Š": "S", "š": "s",
      "Ţ": "T", "ţ": "t", "Ť": "T", "ť": "t", "Ŧ": "T", "ŧ": "t",
      "Ũ": "U", "ũ": "u", "Ū": "U", "ū": "u", "Ŭ": "U", "ŭ": "u", "Ů": "U", "ů": "u", "Ű": "U", "ű": "u", "Ų": "U", "ų": "u",
      "Ŵ": "W", "ŵ": "w",
      "Ŷ": "Y", "ŷ": "y", "Ÿ": "Y",
      "Ź": "Z", "ź": "z", "Ż": "Z", "ż": "z", "Ž": "Z", "ž": "z",
      "ſ": "s", "ƒ": "f",
      "Ơ": "O", "ơ": "o", "Ư": "U", "ư": "u",
      "Ǎ": "A", "ǎ": "a", "Ǐ": "I", "ǐ": "i", "Ǒ": "O", "ǒ": "o", "Ǔ": "U", "ǔ": "u",
      "Ǖ": "U", "ǖ": "u", "Ǘ": "U", "ǘ": "u", "Ǚ": "U", "ǚ": "u", "Ǜ": "U", "ǜ": "u",
      "Ǻ": "A", "ǻ": "a", "Ǽ": "AE", "ǽ": "ae", "Ǿ": "O", "ǿ": "o"
    },
    // \W matches one character outside [A-Za-z0-9_] at a time, so only
    // those characters are ever looked up.
    nonWord = /\W/g,
    mapping = function (c) {
      return map[c] || c;
    };

  return function (str) {
    return str.replace(nonWord, mapping);
  };
}());
I just wanted to post my solution using String#localeCompare
// The only characters fix() can ever emit. Every input character is mapped to
// the first entry it equals at "base" collation strength.
const base_chars = [
  '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  '-', '_', ' '
];

/**
 * Reduce a string to the characters listed in base_chars.
 *
 * The input is decomposed with NFKD, then each resulting character is
 * replaced by the first base character that compares equal to it when
 * accents and case are ignored. Characters with no equivalent in base_chars
 * produce `undefined`, which `join('')` renders as an empty string, so they
 * are dropped from the result. Upper-case letters come out as lower case,
 * because only lower-case letters are listed.
 *
 * @param {string} str text to simplify
 * @returns {string} text consisting only of characters from base_chars
 */
const fix = str => {
  // One collator for the whole call instead of a locale-tagged
  // localeCompare() per candidate per character; the comparison is the same.
  const collator = new Intl.Collator('en', { sensitivity: 'base' });
  return str
    .normalize('NFKD')
    .split('')
    .map(c => base_chars.find(bc => collator.compare(bc, c) === 0))
    .join('');
};

const str = 'OÒ óëå-123';
console.log(`fix(${str}) = ${fix(str)}`);
A long time ago I did this in Java and found someone else's solution based on a single string that captures the part of the Unicode table that mattered for the conversion – everything else was converted to ? or another replacement character. So I tried to convert it to JavaScript. Mind that I'm no JS expert. 🙂
// Replacement table for the code units U+00C0..U+017F, 16 entries per row:
// the character at index (code - 0x00C0) is the ASCII stand-in for that code.
// Declared explicitly: an undeclared assignment throws a ReferenceError in
// strict mode and in ES modules.
var TAB_00C0 =
    "AAAAAAACEEEEIIII" +
    "DNOOOOO*OUUUUYIs" +
    "aaaaaaaceeeeiiii" +
    "?nooooo/ouuuuy?y" +
    "AaAaAaCcCcCcCcDd" +
    "DdEeEeEeEeEeGgGg" +
    "GgGgHhHhIiIiIiIi" +
    "IiJjJjKkkLlLlLlL" +
    "lLlNnNnNnnNnOoOo" +
    "OoOoRrRrRrSsSsSs" +
    "SsTtTtTtUuUuUuUu" +
    "UuUuWwYyYZzZzZzF";

/**
 * Convert a string to plain ASCII, one output character per input code unit.
 *
 * Code units in U+00C0..U+017F are replaced through TAB_00C0, every other
 * code unit above 127 becomes "?", and ASCII is copied unchanged. The result
 * therefore always has the same length as the input.
 *
 * @param {string} source text to convert
 * @returns {string} ASCII-only text of the same length
 */
function stripDiacritics(source) {
    var result = source.split('');
    for (var i = 0; i < result.length; i++) {
        var c = source.charCodeAt(i);
        if (c >= 0x00c0 && c <= 0x017f) {
            result[i] = TAB_00C0.charAt(c - 0x00c0);
        } else if (c > 127) {
            result[i] = '?';
        }
    }
    return result.join('');
}

stripDiacritics("Šupa, čo? ľšťčžýæøåℌð");
This converts most of latin1+2 Unicode characters. It is not able to translate single char to multiple. I don't know its performance on JS, in Java this is by far the fastest of common solutions (6-50x), there is no map, there is no regex, nothing. It produces strict ASCII output, potentially with a loss of information, but the size of the output matches the input.
I tested the snippet with http://www.webtoolkitonline.com/javascript-tester.html and it produced Supa, co? lstczyaoa??
如预期。
Based on existing answers and some suggestions, I've created this one:
/**
 * Replace accented and otherwise decorated Latin letters in the string with
 * their plain ASCII equivalents, e.g. "ąąą".removeAccents() === "aaa".
 * Some characters expand to two letters ("Æ" -> "AE", U+01C4 -> "DZ").
 *
 * The translation table is built once, when this snippet is loaded, and each
 * call is a single pass over the string: only non-ASCII characters are looked
 * up, so plain ASCII text is always returned unchanged and a replacement is
 * never translated a second time.
 *
 * @this {String}
 * @returns {string} the string with known accented characters replaced
 */
String.prototype.removeAccents = (function () {
  // [replacement, characters that are replaced by it]
  const groups = [
    ['A', 'ⒶÀÁÂẦẤẪẨÃĀĂẰẮẴẲȦǠÄǞẢÅǺǍȀȂẠẬẶḀĄ'],
    ['AA', 'Ꜳ'],
    ['AE', 'ÆǼǢ'],
    ['AO', 'Ꜵ'],
    ['AU', 'Ꜷ'],
    ['AV', 'ꜸꜺ'],
    ['AY', 'Ꜽ'],
    ['B', 'ⒷḂḄḆɃƂƁ'],
    ['C', 'ⒸĆĈĊČÇḈƇȻꜾ'],
    ['D', 'ⒹḊĎḌḐḒḎĐƋƊƉꝹ'],
    // The digraph letters are written as escapes on purpose: spelled as two
    // ordinary letters they would make every plain "D", "Z", "L", "J", "n",
    // "j", ... in the input match and be rewritten.
    ['DZ', '\u01F1\u01C4'],
    ['Dz', '\u01F2\u01C5'],
    ['E', 'ⒺÈÉÊỀẾỄỂẼĒḔḖĔĖËẺĚȄȆẸỆȨḜĘḘḚƐƎ'],
    ['F', 'ⒻḞƑꝻ'],
    ['G', 'ⒼǴĜḠĞĠǦĢǤƓꞠꝽꝾ'],
    ['H', 'ⒽĤḢḦȞḤḨḪĦⱧⱵꞍ'],
    ['I', 'ⒾÌÍÎĨĪĬİÏḮỈǏȈȊỊĮḬƗ'],
    ['J', 'ⒿĴɈ'],
    ['K', 'ⓀḰǨḲĶḴƘⱩꝀꝂꝄꞢ'],
    ['L', 'ⓁĿĹĽḶḸĻḼḺŁȽⱢⱠꝈꝆꞀ'],
    ['LJ', '\u01C7'],
    ['Lj', '\u01C8'],
    ['M', 'ⓂḾṀṂⱮƜ'],
    ['N', 'ⓃǸŃÑṄŇṆŅṊṈȠƝꞐꞤ'],
    ['NJ', '\u01CA'],
    ['Nj', '\u01CB'],
    ['O', 'ⓄÒÓÔỒỐỖỔÕṌȬṎŌṐṒŎȮȰÖȪỎŐǑȌȎƠỜỚỠỞỢỌỘǪǬØǾƆƟꝊꝌ'],
    ['OI', 'Ƣ'],
    ['OO', 'Ꝏ'],
    ['OU', 'Ȣ'],
    ['P', 'ⓅṔṖƤⱣꝐꝒꝔ'],
    ['Q', 'ⓆꝖꝘɊ'],
    ['R', 'ⓇŔṘŘȐȒṚṜŖṞɌⱤꝚꞦꞂ'],
    ['S', 'ⓈẞŚṤŜṠŠṦṢṨȘŞⱾꞨꞄ'],
    ['T', 'ⓉṪŤṬȚŢṰṮŦƬƮȾꞆ'],
    ['TZ', 'Ꜩ'],
    ['U', 'ⓊÙÚÛŨṸŪṺŬÜǛǗǕǙỦŮŰǓȔȖƯỪỨỮỬỰỤṲŲṶṴɄ'],
    ['V', 'ⓋṼṾƲꝞɅ'],
    ['VY', 'Ꝡ'],
    ['W', 'ⓌẀẂŴẆẄẈⱲ'],
    ['X', 'ⓍẊẌ'],
    ['Y', 'ⓎỲÝŶỸȲẎŸỶỴƳɎỾ'],
    ['Z', 'ⓏŹẐŻŽẒẔƵȤⱿⱫꝢ'],
    ['a', 'ⓐẚàáâầấẫẩãāăằắẵẳȧǡäǟảåǻǎȁȃạậặḁąⱥɐ'],
    ['aa', 'ꜳ'],
    ['ae', 'æǽǣ'],
    ['ao', 'ꜵ'],
    ['au', 'ꜷ'],
    ['av', 'ꜹꜻ'],
    ['ay', 'ꜽ'],
    ['b', 'ⓑḃḅḇƀƃɓ'],
    ['c', 'ⓒćĉċčçḉƈȼꜿↄ'],
    ['d', 'ⓓḋďḍḑḓḏđƌɖɗꝺ'],
    ['dz', '\u01F3\u01C6'],
    ['e', 'ⓔèéêềếễểẽēḕḗĕėëẻěȅȇẹệȩḝęḙḛɇɛǝ'],
    ['f', 'ⓕḟƒꝼ'],
    ['g', 'ⓖǵĝḡğġǧģǥɠꞡᵹꝿ'],
    ['h', 'ⓗĥḣḧȟḥḩḫẖħⱨⱶɥ'],
    ['hv', 'ƕ'],
    ['i', 'ⓘìíîĩīĭïḯỉǐȉȋịįḭɨı'],
    ['j', 'ⓙĵǰɉ'],
    ['k', 'ⓚḱǩḳķḵƙⱪꝁꝃꝅꞣ'],
    ['l', 'ⓛŀĺľḷḹļḽḻłƚɫⱡꝉꞁꝇ'],
    ['lj', '\u01C9'],
    ['m', 'ⓜḿṁṃɱɯ'],
    // U+0149 (n preceded by apostrophe) is escaped so it cannot fall apart
    // into an apostrophe-like character plus a plain "n".
    ['n', 'ⓝǹńñṅňṇņṋṉƞɲ\u0149ꞑꞥ'],
    ['nj', '\u01CC'],
    ['o', 'ⓞòóôồốỗổõṍȭṏōṑṓŏȯȱöȫỏőǒȍȏơờớỡởợọộǫǭøǿɔꝋꝍɵ'],
    ['oi', 'ƣ'],
    ['ou', 'ȣ'],
    ['oo', 'ꝏ'],
    ['p', 'ⓟṕṗƥᵽꝑꝓꝕ'],
    ['q', 'ⓠɋꝗꝙ'],
    ['r', 'ⓡŕṙřȑȓṛṝŗṟɍɽꝛꞧꞃ'],
    // U+017F (long s) is a form of "s", so it is listed here rather than
    // under "l".
    ['s', 'ⓢßśṥŝṡšṧṣṩșşȿꞩꞅẛ\u017F'],
    ['t', 'ⓣṫẗťṭțţṱṯŧƭʈⱦꞇ'],
    ['tz', 'ꜩ'],
    ['u', 'ⓤùúûũṹūṻŭüǜǘǖǚủůűǔȕȗưừứữửựụṳųṷṵʉ'],
    ['v', 'ⓥṽṿʋꝟʌ'],
    ['vy', 'ꝡ'],
    ['w', 'ⓦẁẃŵẇẅẘẉⱳ'],
    ['x', 'ⓧẋẍ'],
    ['y', 'ⓨỳýŷỹȳẏÿỷẙỵƴɏỿ'],
    ['z', 'ⓩźẑżžẓẕƶȥɀⱬꝣ'],
  ];

  const lookup = new Map();

  // Record `replacement` for every character of `chars`; the first group
  // that lists a character wins.
  const register = (replacement, chars) => {
    for (let i = 0; i < chars.length; i++) {
      const code = chars.charCodeAt(i);
      // Guard against a table that was re-normalised by a tool: a decomposed
      // entry would otherwise register a bare ASCII letter or a stand-alone
      // combining mark (U+0300..U+036F) as something to replace.
      if (code < 0x80 || (code >= 0x0300 && code <= 0x036f)) {
        continue;
      }
      const ch = chars.charAt(i);
      if (!lookup.has(ch)) {
        lookup.set(ch, replacement);
      }
    }
  };

  groups.forEach(([replacement, chars]) => register(replacement, chars));

  // Fullwidth forms U+FF21..U+FF3A and U+FF41..U+FF5A fold to A-Z and a-z.
  for (let offset = 0; offset < 26; offset++) {
    register(String.fromCharCode(0x41 + offset), String.fromCharCode(0xff21 + offset));
    register(String.fromCharCode(0x61 + offset), String.fromCharCode(0xff41 + offset));
  }

  const nonAscii = /[^\u0000-\u007f]/g;

  return function () {
    return String(this).replace(nonAscii, (ch) => (lookup.has(ch) ? lookup.get(ch) : ch));
  };
}());
It uses real chars instead of unicode list and works well.
You can use it like
"ąąą".removeAccents(); // returns "aaa"
You can easily convert this function so that it does not live on the String prototype. However, as I'm a fan of using the String prototype in such cases, you'll have to do that yourself.
Not a single answer mentions String.localeCompare, which happens to do exactly what you originally wanted, but not what you're asking for.
// Sort single letters with the runtime's locale-aware string comparison, so
// that accented letters are ordered next to their base letter rather than
// after "z".
var list = ['a', 'b', 'c', 'o', 'u', 'z', 'ä', 'ö', 'ü'];

list.sort(function (left, right) {
  return left.localeCompare(right);
});

console.log(list);
//Outputs ['a', 'ä', 'b', 'c', 'o', 'ö', 'u', 'ü', 'z']
The second and third parameters of localeCompare are not supported by older browsers, though. It's an option worth considering nonetheless.
If you want to achieve sorting where "ä" comes after "a" and is not treated as the same, then you can use a function like mine.
You can always change the alphabet to get different or even weird sortings. However, if you want some letters to be equivalent, then you have to manipulate the strings like a = a.replace(/ä/, 'a')
or similar, as many have already replied above. I've included the uppercase letters in case someone wants to have all uppercase words before all lowercase words (then you have to omit .toLowerCase()).
/**
 * Comparator for Array#sort that orders strings by a custom alphabet in which
 * every accented letter directly follows its base letter (so "ä" sorts after
 * "a" but before "b") and is NOT treated as equal to it.
 *
 * Both strings are lower-cased first, so the comparison is case-insensitive.
 * Characters that are not part of the alphabet get index -1 and therefore
 * sort before every listed character. If one string is a prefix of the
 * other, the shorter one sorts first.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} negative if a sorts first, positive if b sorts first, 0 if equal
 */
function sortbyalphabet(a, b) {
  // The plain vowels E, I, U and Y must be listed explicitly; without them
  // indexOf() returns -1 and those letters would sort in front of the digits.
  var alphabet = "0123456789AaÀàÁáÂâÃãÄäBbCcÇçDdEeÈèÉéÊêËëFfGgHhIiÌìÍíÎîÏïJjKkLlMmNnÑñOoÒòÓóÔôÕõÖöPpQqRrSsTtUuÙùÚúÛûÜüVvWwXxYyÝýŸÿZz";
  var left = a.toLowerCase();
  var right = b.toLowerCase();
  // Only the positions both strings have can differ character by character.
  var common = Math.min(left.length, right.length);

  for (var i = 0; i < common; i++) {
    var diff = alphabet.indexOf(left.charAt(i)) - alphabet.indexOf(right.charAt(i));
    if (diff !== 0) {
      return diff;
    }
  }
  // sort the shorter first
  return left.length - right.length;
}

var n = ["ast", "Äste", "apfel", "äpfel", "à"];
console.log(n.sort(sortbyalphabet));
// should return ["apfel", "ast", "à", "äpfel", "Äste"]
A simple and easy way:
/**
 * Replace the accented vowels and cedillas listed below with their plain
 * ASCII letter; every other character is copied unchanged.
 *
 * The function is named removeAccents because "remove-accents" is not a valid
 * JavaScript identifier. Characters are located with indexOf() rather than
 * search(), which would interpret each input character as a regular
 * expression (so "." would match and "(" would throw).
 *
 * @param {string} p text to clean
 * @returns {string} text with the listed accented characters replaced
 */
function removeAccents(p) {
  // accented[k] is replaced by plain[k]; both strings must stay aligned.
  var accented = 'áàãâäéèêëíìîïóòõôöúùûüçÁÀÃÂÄÉÈÊËÍÌÎÏÓÒÕÖÔÚÙÛÜÇ';
  var plain = 'aaaaaeeeeiiiiooooouuuucAAAAAEEEEIIIIOOOOOUUUUC';
  var result = '';

  for (var i = 0; i < p.length; i++) {
    var ch = p.charAt(i);
    var at = accented.indexOf(ch);
    result += at >= 0 ? plain.charAt(at) : ch;
  }
  return result;
}
// So do this:
removeAccents("Thís ís ân accêntéd phráse");
输出:
"This is an accented phrase"
I've solved it another way, if you like.
Here I used two arrays: searchChars contains the characters that will be replaced, and replaceChars contains the desired replacement characters.
// Replace, in place, every character of `text` that appears in searchChars
// with the entry at the same index in replaceChars.
var text = "your input string";
var searchChars = ['Å', 'Ä', 'å', 'Ö', 'ö']; // add more characters.
var replaceChars = ['A', 'A', 'a', 'O', 'o']; // exact same index as in searchChars.
var index;

for (var i = 0; i < text.length; i++) {
  // A single native indexOf() both tests membership and yields the position,
  // so no jQuery ($.inArray) is needed.
  index = searchChars.indexOf(text[i]);
  if (index > -1) {
    // Each replacement is a single character, so the length of `text` and
    // the loop index stay valid.
    text = text.slice(0, i) + replaceChars[index] + text.slice(i + 1, text.length);
  }
}