'''
ASCIIfy

Kristjan Kannike
Created: 2005-09-05
Changed: 2006-05-23

"ASCIIfy" a string.
'''

from unicodedata import combining, decomposition
import re

try:
    _unichr = unichr
except NameError:
    # Python 3 has no unichr(); chr() is its equivalent.
    _unichr = chr

# Two or more consecutive whitespace characters.
_WHITESPACE_RUN = re.compile(r'\s\s+')
# Anything that is not an ASCII letter, digit or underscore. Spelled out
# instead of r'\W' so the result does not depend on whether the running
# interpreter treats \W as Unicode-aware.
_NON_ALNUM = re.compile(r'[^0-9a-zA-Z_]')


def _strip_marks(char):
    '''Return the undecorated base character(s) of a single character.

    Characters without a Unicode decomposition are returned unchanged.
    '''
    decomp = decomposition(char)
    if not decomp:
        return char
    fields = decomp.split()
    if fields[0].startswith('<'):
        # Compatibility mapping such as '<compat> 0066 0069': the first
        # field is a formatting tag, not a code point, and every following
        # code point may carry content. Combining marks are dropped.
        parts = [_unichr(int(code, 16)) for code in fields[1:]]
        parts = [part for part in parts if not combining(part)]
    else:
        # Canonical mapping: only the first code point is kept, the
        # remaining ones are discarded.
        parts = [_unichr(int(fields[0], 16))]
    # A base character can itself be decomposable (U+01D6 -> U+00FC -> 'u'),
    # so keep reducing until nothing decomposes any further.
    return u''.join(_strip_marks(part) for part in parts)


def asciify(s):
    '''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.

    Every character that has a Unicode decomposition is replaced by its
    base character(s); characters without one (for example u'\\xf8') are
    kept as they are, so the result is not guaranteed to be pure ASCII.

    s -- the Unicode string to process.

    Returns a new Unicode string.
    '''
    return u''.join(_strip_marks(char) for char in s)


def normalise(s):
    '''ASCIIfy a Unicode string, put it into lowercase, substitute '_' for
    ' ', and remove all non-alphanumeric characters.

    A run of two or more whitespace characters is collapsed into a single
    space (and therefore a single '_') rather than being deleted, so that
    neighbouring words are not merged.

    s -- the Unicode string to process.

    Returns a string made up only of ASCII letters, digits and '_'.
    '''
    temp = asciify(s).lower()
    temp = _WHITESPACE_RUN.sub(u' ', temp)  # Collapse excessive whitespace
    temp = temp.replace(u' ', u'_')
    return _NON_ALNUM.sub(u'', temp)  # Remove all non-alphanumeric characters