'''
Bookmark processing
Kristjan Kannike
Created: 2005-06-04
Changed: 2009-06-30
Clean up Firefox bookmarks (remove non-standard attributes, esp. ICON), with
the exception of FEEDURL, and make it into a well-formed XHTML file.
Set correct heading levels.
It can be used from the command line as
python bookmarks.py bookmarks.html [cleanedBookmarksFilename.html]
'''
import re
from htmlentitydefs import name2codepoint
from os.path import splitext
def fromName2codepoint(string):
    '''Replace named (X)HTML entities with numeric character references.

    Every ``&name;`` whose name is a key of ``name2codepoint`` is rewritten
    as ``&#N;``, where N is the decimal code point listed for that name.
    Anything that is not a known named entity is left untouched.

    string -- the text to convert.
    Returns the converted text.

    >>> fromName2codepoint('This &amp; that.')
    'This &#38; that.'
    '''
    # items() instead of iteritems(): same result, and it exists on both
    # Python 2 and Python 3 dictionaries.
    for key, val in name2codepoint.items():
        # The '&#' prefix is what makes the output a character reference;
        # a bare 'NN;' would simply corrupt the text.
        string = string.replace('&%s;' % key, '&#%d;' % val)
    return string
def cleanBookmarks(filename, newFilename = None):
'''Firefox bookmarks -> cleaned up bookmarks in an XHTML file'''
# NOTE(review): in this copy the body has lost its indentation and several
# string literals look truncated or emptied (apparently the markup they
# contained was stripped), so parts of the logic cannot be read reliably.
# Restore them from a pristine copy before relying on this function.
#
# Steps that are visible here:
#   1. read the whole of filename into bmCont;
#   2. walk its lines while keeping a headingLevel counter;
#   3. apply the literal substitutions collected in bmSubs;
#   4. rewrite named entities with fromName2codepoint();
#   5. collect regex cleanup patterns in reSubsInBM (the code that applies
#      them is not visible in this copy);
#   6. wrap the text between the before and after templates;
#   7. write the result to newFilename or, when that is None, to a file
#      named root + _CLEANED + ext derived from filename.
# Nothing is returned.
bm = file(filename, 'r')
bmCont = bm.read()
bm.close()
# Correct heading levels
# headingLevel starts at 1 and is raised or lowered whenever a line contains
# a marker string (the markers themselves are not legible in this copy).
bmContLines = bmCont.split('\n')
headingLevel = 1
for i in range(0, len(bmContLines)):
if '
' in bmContLines[i]:
headingLevel += 1
if '
' in bmContLines[i]:
headingLevel -= 1
bmContLines[i] = \
bmContLines[i].replace('': '',
'': '',
'Bookmarks': '',
' >': '>',
'': '',
'': '',
'
': '',
'
': '',
'updated null': '',
'error null': '',
'
': '',
'': '',
'': '',
'HREF': 'href',
'FEEDURL': 'feedurl',
'
': '
',
' -> ': ' -> '
}
# Apply every literal (non-regex) substitution to the whole text.
for sub, repl in bmSubs.iteritems():
bmCont = bmCont.replace(sub, repl)
# Rewrite named (X)HTML entities before the regex-based cleanup.
bmCont = fromName2codepoint(bmCont)
# Cleanup substitution patterns for regexes
reSubsInBM = {
'.*?\n': '',
' ADD_DATE="[^"]*"': '',
' LAST_MODIFIED="[^"]*"': '',
'PERSONAL_TOOLBAR_FOLDER="true"': '',
' ID="[^"]*"': '',
' LAST_VISIT="[^"]*"': '',
'ICON="[^"]*"': '',
'ICON_URI="[^"]*"': '',
' LAST_CHARSET="[^"]*"': '',
'': '', # Esp. the JavaScript shell
'': '',
r'(no-)?(updated|error) \[.*?\]': '', # From Sage the Firefox RSS reader
'&(?!#)': '&', # Unencoded &'s
'
Bookmarks
'''
after = '''
'''
bmCont = before + bmCont + after
if newFilename is not None:
bmNew = file(newFilename, 'w')
else:
(root, ext) = splitext(filename)
bmNew = file(root+'_CLEANED'+ext, 'w')
bmNew.write(bmCont)
bmNew.close()
def main(argv):
    '''Command-line driver: clean the bookmarks file named in argv.

    argv -- the arguments after the script name: the bookmarks file and,
            optionally, the name of the file to write the result to.

    With no arguments only the usage text is printed.  Any exception raised
    while cleaning is printed, followed by the usage text; nothing is
    re-raised.  Returns None.
    '''
    def usage():
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Convert your Firefox bookmarks into clean XHTML')
        print('Usage:')
        print('> python bookmarks.py bookmarks.html')
        print('or')
        print('> python bookmarks.py bookmarks.html cleanedBookmarks.html')

    # Without an input file there is nothing to clean; show the help rather
    # than the cryptic "list index out of range" the lookup below would raise.
    if not argv:
        usage()
        return

    bmName = argv[0]
    # The output name is honoured only when exactly two arguments are given.
    if len(argv) == 2:
        newBmName = argv[1]
    else:
        newBmName = None

    try:
        cleanBookmarks(bmName, newBmName)
    except Exception as e:
        # Top-level boundary of the script: report the problem, then the usage.
        print(e)
        usage()
# Script entry point: hand the arguments after the script name to main().
if __name__ == '__main__':
    import sys
    cliArgs = sys.argv[1:]
    main(cliArgs)