HTML Sanitizer

 * Script for sanitizing HTML input to only allow what is in the whitelist.
 * Tested against majority of the hacks listed at
 * @author Cameron Zemek <>
 * @license MIT License

Fix whitespace

# Normalize whitespace in every regular file below the current directory,
# skipping anything whose path mentions .svn and keeping only paths that end
# in one of the patterns in $EXTENSIONS.
# NOTE(review): EXTENSIONS is not set in this snippet; presumably a
# |-separated list of file extensions defined by the caller -- confirm.
# NOTE(review): the file list is word-split, so paths containing whitespace
# are still not supported by this loop.
for file in `find . -type f | grep -v "\.svn" | grep -E "($EXTENSIONS)$"`
do
    # Send the commands H and w to ed
    # ed will append newline if the file does not end in one
    printf "%s\n" H w | ed -s "$file"
    # Strip trailing whitespace
    sed -i 's/[ \t]*$//g' "$file"
    # Convert tabs to 4 spaces
    sed -i -r "s/\t/    /g" "$file"
done

Google Search Screen Scraper

#!/usr/bin/env python
import urllib2
from urllib import urlencode
from urlparse import urlparse
from BeautifulSoup import BeautifulSoup
def doGoogleSearch(query, limit=10):
    def _googleSearch(query, start, limit):
        urlParams = {'q' : query}
        if start > 0:
            urlParams['start'] = start
        url = "" + urlencode(urlParams)
        request = urllib2.Request(url)
        # Google blocks queries based on User Agent.


#!/usr/bin/env python
import sys
import getopt
def isHexDigit(c):
    """Return True if c counts as a hexadecimal digit.

    c qualifies when c.isdigit() is true or when it compares within the
    ranges 'A'..'F' or 'a'..'f'.
    """
    if c.isdigit():
        return True
    return 'A' <= c <= 'F' or 'a' <= c <= 'f'
def isEntityStartCharacter(c):
    """Return True if c may begin an entity name.

    Accepted characters are those for which c.isalpha() is true, plus the
    underscore and the colon.
    """
    if c.isalpha():
        return True
    return c in ('_', ':')
def isEntityCharacter(c):
    """Return True if c may appear inside an entity name.

    That is every character accepted by isEntityStartCharacter, plus any
    character for which c.isdigit() is true, the dot and the hyphen.
    """
    if isEntityStartCharacter(c):
        return True
    return c.isdigit() or c in ('.', '-')
class ParseError(Exception):
    """Exception type carrying a single message string."""

    def __init__(self, msg):
        # Hand the message to the base Exception so str(err) yields it.
        super(ParseError, self).__init__(msg)
class TidyHtml:
    def init(self, html):
        self.input = html
        self.pos = 0
        self.tagStack = []
        self.beginText = False # Indic