'''
Bookmark processing

Kristjan Kannike
Created: 2005-06-04
Changed: 2009-06-30

Clean up Firefox bookmarks (remove non-standard attributes, esp. ICON),
with the exception of FEEDURL, and make it into a well-formed XHTML file.
Set correct heading levels.

It can be used from the command line as

    python bookmarks.py bookmarks.html [cleanedBookmarksFilename.html]
'''

# NOTE: this module targets Python 2 -- it relies on the ``htmlentitydefs``
# module, ``dict.iteritems()`` and the ``file()`` builtin.
import re
from htmlentitydefs import name2codepoint
from os.path import splitext


def fromName2codepoint(string):
    '''Replace named (X)HTML entities with numeric character references.

    Every ``&name;`` whose name is a key of ``name2codepoint`` is rewritten
    as ``&#N;``, where N is the decimal code point stored for that name.
    Text that does not match a known entity name is left untouched.

    string -- the text to convert.

    Returns the converted text.

    >>> fromName2codepoint('This &amp; that.')
    'This &#38; that.'
    '''
    # One plain-text replace per known entity name; the replacement always
    # has the form '&#<digits>;'.
    for key, val in name2codepoint.iteritems():
        string = string.replace('&%s;' % key, '&#%d;' % val)
    return string


def cleanBookmarks(filename, newFilename = None):
    '''Firefox bookmarks -> cleaned up bookmarks in an XHTML file'''
    # Read the whole bookmarks file into memory.
    bm = file(filename, 'r')
    bmCont = bm.read()
    bm.close()
    # Correct heading levels
    # NOTE(review): from the next string literal onwards this copy of the
    # file looks damaged -- the literals are empty or unterminated, presumably
    # because markup inside them was stripped. Restore them from a pristine
    # copy before running; TODO confirm against the original source.
    bmContLines = bmCont.split('\n')
    headingLevel = 1
    for i in range(0, len(bmContLines)):
        if '
' in bmContLines[i]: headingLevel += 1 if '
' in bmContLines[i]: headingLevel -= 1 bmContLines[i] = \ bmContLines[i].replace('': '', '': '', 'Bookmarks': '', ' >': '>', '': '', '': '', '

': '', '

': '', 'updated null': '', 'error null': '', '

': '', '
': '', '': '

', 'HREF': 'href', 'FEEDURL': 'feedurl', '
': '
', ' -> ': ' -> ' } for sub, repl in bmSubs.iteritems(): bmCont = bmCont.replace(sub, repl) bmCont = fromName2codepoint(bmCont) # Cleanup substitution patterns for regexes reSubsInBM = { '
.*?\n': '', ' ADD_DATE="[^"]*"': '', ' LAST_MODIFIED="[^"]*"': '', 'PERSONAL_TOOLBAR_FOLDER="true"': '', ' ID="[^"]*"': '', ' LAST_VISIT="[^"]*"': '', 'ICON="[^"]*"': '', 'ICON_URI="[^"]*"': '', ' LAST_CHARSET="[^"]*"': '', '': '', # Esp. the JavaScript shell '': '', r'(no-)?(updated|error) \[.*?\]': '', # From Sage the Firefox RSS reader '&(?!#)': '&', # Unencoded &'s '

Bookmarks '''
    after = ''' '''
    # Wrap the cleaned content between the document prologue and epilogue.
    bmCont = before + bmCont + after
    if newFilename is not None:
        bmNew = file(newFilename, 'w')
    else:
        # No output name given: write next to the input file, inserting
        # '_CLEANED' between the file name root and its extension.
        (root, ext) = splitext(filename)
        bmNew = file(root+'_CLEANED'+ext, 'w')
    bmNew.write(bmCont)
    bmNew.close()


def main(argv):
    '''Command line entry point.

    argv -- list of command line arguments without the program name:
            argv[0] is the bookmarks file to clean; when exactly two
            arguments are given, argv[1] is the output file name. With any
            other argument count the output name is left as None and
            cleanBookmarks() derives one from the input name.

    Any exception -- including the IndexError raised when argv is empty --
    is printed, followed by the usage text; nothing is re-raised.
    '''
    # NOTE: Python 2 syntax (print statement, ``except Exception, e``).
    def usage():
        print 'Convert your Firefox bookmarks into clean XHTML'
        print 'Usage:'
        print '> python bookmarks.py bookmarks.html'
        print 'or'
        print '> python bookmarks.py bookmarks.html cleanedBookmarks.html'
    try:
        bmName = argv[0]
        if len(argv) == 2:
            newBmName = argv[1]
        else:
            newBmName = None
        cleanBookmarks(bmName, newBmName)
    except Exception, e:
        # Report the problem, then remind the user how to call the script.
        print e
        usage()


# For command line use
if __name__ == '__main__':
    import sys
    # Drop the program name; main() expects only the real arguments.
    main(sys.argv[1:])