# htmlize.py

#!/usr/bin/env python
################################################################################
# File     : htmlize.py
# Function : Htmlize script/library for python
################################################################################
# Newest version can be obtained at http://www.freshlime.org
# Send comments or questions to code at freshlime dot org
# $Id: htmlize.py 99 2008-07-04 15:21:20Z jbester $
################################################################################
# Copyright (c) 2008, J. Bester
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * The name of the authors names of its contributors may be used to 
#       endorse or promote products derived from this software without
#       specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
################################################################################

"""
Htmlize python script/library

Roughly equivalent to htmlize in Emacs; this version only supports Python.
Output uses the same CSS class names, so users can reuse the CSS from an
htmlize-generated HTML page and keep their Emacs look.
"""

import re
import os
import sys

# Page prologue, written before the highlighted source.  This is a
# %-format template with two %s slots, in order: the page title and
# the href of the stylesheet link.  The <pre> opened here is closed
# by FOOTER.
HEADER = """
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<!-- Created by htmlize-1.34 in css mode. -->
<html>
  <head>
    <title>%s</title>
    <link rel="stylesheet" href="%s" type="text/css"/>
  </head>
  <body>
    <pre>
"""

# Page epilogue: closes the <pre>, <body> and <html> elements opened
# by HEADER.
FOOTER = """
    </pre>
  </body>
</html>
"""

# Regular expressions
#
# Each lexical pattern is anchored with ^ because it is matched against the
# start of the input that has not been consumed yet.

# a comment: from '#' up to, but not including, the end of the line
COMMENT = re.compile( r"^[#][^\n]*" )
# an identifier, optionally followed by dotted attribute names
TOKEN = re.compile( r"^[a-zA-Z_][a-zA-Z0-9_]*([ \t]*[.][a-zA-Z_][a-zA-Z0-9_]*)*" )
WHITESPACE = re.compile( r"^[\s\n]+" )
# a string literal with an optional u or r prefix; the alternatives are, in
# order: triple double quoted, triple single quoted, single quoted and
# double quoted
STRINGS = re.compile( r'^[ur]?((["]["]["](.|\n)*?["]["]["])|([\'][\'][\'](.|\n)*?[\'][\'][\'])|([\'](([\\]\')|[^\'\n])*[\'])|(["](\\"|[^"\n])*?["]))')
# a decorator: '@' followed by the (possibly dotted) decorator name.  The
# name is optional so that a lone '@' is still consumed instead of stopping
# the tokenizer.  Arguments after the name are left to the other patterns.
ANNOTATION = re.compile( r'^@([a-zA-Z_][a-zA-Z0-9_.]*)?' )
# runs of operators, brackets, separators and digits.  Includes ';', '~',
# the backslash (line continuation) and the backtick (repr in python 2) so
# that source using them does not stop the tokenizer.
PUNCTUATION = re.compile( r'^([-!=+*/:%(){}><,.;~`\\]|[0-9]|[\[\]|&^])+')

# tokens rendered with the 'keyword' css class
KEYWORDS = ['and', 'del', 'from', 'not', 'while', 'as', 'elif',
            'global', 'or', 'with', 'assert', 'else', 'if', 'pass',
            'yield', 'break', 'except', 'import', 'print', 'class',
            'exec', 'in', 'raise', 'continue', 'finally', 'is',
            'return', 'def', 'for', 'lambda', 'try']

# tokens rendered with the 'py-pseudo-keyword' css class
PSEUDO_KEYWORDS = ['None', 'True', 'False', 'self' ]

# a http or https url; it ends at the first space, '<' or line break
URL = re.compile( r'(http(s)?://[^ <\n\r]+)' )

def span( classname, txt ):
    """
    Wrap text in a html span carrying a css class

    classname - css class name placed in the class attribute
    txt - raw text; it is html quoted before being wrapped
    Return string html span element
    """
    quoted = html_quote( txt )
    return "<span class='%s'>%s</span>" % ( classname, quoted )

def htmlize_urls( text ):
    """
    Turn every url found in the text into a html link

    text - text to scan for urls
    returns the text with each url wrapped in an anchor tag; text
            without any url comes back unchanged
    """
    def as_link( found ):
        # the url serves as both the link target and the link label
        address = found.group( 0 )
        return '<a href="%s">%s</a>' % ( address, address )

    # sub walks the non-overlapping matches from left to right and
    # copies the text between them untouched
    return URL.sub( as_link, text )
            

def html_quote( txt ):
    """
    Perform html quoting.

    Replaces the characters & < > " and ' with character references;
    every other character is copied unchanged.

    txt - text to quote
    Returns quoted string
    """
    # The single quote is written as the numeric reference &#39; because
    # the generated page declares HTML 4.01, which has no apos entity.
    quote_map = { '&':'&amp;', '<':'&lt;', '>':'&gt;',
                  '"':'&quot;', "'":'&#39;' }
    return ''.join( [ quote_map.get( char, char ) for char in txt ] )

def annotate( buf ):
    """
    Annotate source text with how it should be syntax highlighted

    buf - python source text
    Returns a list of ( annotation, text ) tuples in source order.  The
    annotation is 'comment', 'string', 'token', 'py-decorators' or None
    (punctuation and whitespace).  Joining the text parts in order gives
    back the input.

    Raises Exception when the text at some position matches none of the
    patterns; the message names the line and shows the offending text.
    """
    # map annotation strings to regexps.  The first entry that matches
    # wins, so comments and strings are recognised before plain tokens.
    # Built once per call instead of once per matched piece.
    annotations = ( ('comment', COMMENT),
                    ('string', STRINGS),
                    ('token', TOKEN),
                    ('py-decorators', ANNOTATION),
                    (None, PUNCTUATION),
                    (None, WHITESPACE) )
    output = []
    # keep the complete text so an error can report where parsing stopped
    source = buf
    while buf:
        for ( ident, regex ) in annotations:
            match = regex.match( buf )
            # a zero length match would never advance; treat it as no match
            if match and match.end() > 0:
                # store match, using annotation id from regex, and
                # continue with what is left of the input
                output.append( ( ident, buf[ : match.end() ] ) )
                buf = buf[ match.end() : ]
                break
        else:
            # we're stuck: no pattern matches at this position
            consumed = len( source ) - len( buf )
            lineno = source.count( '\n', 0, consumed ) + 1
            raise Exception( "Parse error at line %d near %r"
                             % ( lineno, buf[ : 20 ] ) )
    return output

def generate_html( outs, annotations, filename, css ):
    """
    Generate HTML from annotations

    outs - output stream, only its write method is used
    annotations - annotation list of ( tag, text ) tuples
    filename - input filename, used as the page title
    css - css path, used as the stylesheet href

    All text taken from the annotations, the filename and the css path
    are html quoted before being written.
    """
    # Look the builtin names up in the builtins module itself.  The
    # __builtins__ global may be either that module or its dictionary,
    # and dir() of a dictionary lists dict methods, not builtin names.
    try:
        import __builtin__ as builtins_module
    except ImportError:
        import builtins as builtins_module
    builtin_names = frozenset( dir( builtins_module ) )

    # alias write
    write = outs.write

    # write the header; the values are formatted with %s first so that
    # non-string arguments keep working as they did before quoting
    title = html_quote( "%s" % ( filename, ) )
    href = html_quote( "%s" % ( css, ) )
    write( HEADER % ( title, href ) )

    ####
    # loop through annotations made earlier
    # output appropriate tag and quoted txt
    # only real special case is def causes next token to be a
    # function-name so handle that through the isdef variable
    isdef = False
    for ( tag, txt ) in annotations:
        if tag is None:
            # punctuation and whitespace get no span but must still be
            # quoted, otherwise <, > and & end up as markup
            write( html_quote( txt ) )
        elif tag == 'comment':
            write( htmlize_urls( span(tag, txt ) ) )
        elif tag == 'string' or tag == 'py-decorators':
            write( span(tag, txt ))
        elif tag == 'token':
            if isdef:
                # first token after def is the function name, even when
                # it shadows a builtin.  The flag survives the whitespace
                # in between because only tokens clear it.
                write( span( 'function-name', txt ))
                isdef = False
            elif txt in KEYWORDS:
                write( span( 'keyword', txt) )
                isdef = ( txt == "def" )
            elif txt in PSEUDO_KEYWORDS:
                write( span( 'py-pseudo-keyword', txt ))
            elif txt in builtin_names:
                write( span( 'py-builtins', txt ))
            else:
                write( html_quote( txt ) )
    # write the footer
    write( FOOTER )

def htmlize( inf, outf, css="css/python.css" ):
    """
    HTMLize a python file

    inf - input file name
    outf - output file name, or an object with a write method;
           if None output goes to stdout
    css - css path name

    A file opened here is closed again before returning.  A stream
    passed in by the caller is left open.
    """
    # basestring only exists on python 2
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # read and annotate the input before touching the output, so a
    # failure here does not leave a truncated output file behind
    inp = open( inf, 'r' )
    try:
        source = inp.read()
    finally:
        inp.close()
    annotations = annotate( source )
    filename = os.path.split(inf)[1]

    if outf is None:
        outf = sys.stdout

    if isinstance( outf, string_types ):
        outs = open( outf, 'w' )
        try:
            generate_html( outs, annotations, filename, css )
        finally:
            outs.close()
    else:
        generate_html( outf, annotations, filename, css )

def main(  ):
    """
    Entrypoint

    Reads the command line arguments:
    one argument - input file, html is written to stdout
    two or three arguments - input file, output file and optionally
                             the css path
    anything else - the usage text is written to stdout
    """
    args = sys.argv[ 1 : ]
    argc = len( args )
    # input only
    if argc == 1:
        htmlize( args[0], sys.stdout )
    elif argc == 2 or argc == 3:
        # input, output and optionally css, passed on positionally
        htmlize( *args )
    else:
        # written through sys.stdout so this works on python 2 and 3;
        # the extra newline is the one a print statement would add
        sys.stdout.write( """
        htmlize.py inputfile [outputfile] [cssfile]
        
        inputfile must be python
        outputfile is html (optional) defaults to stdout
        cssfile is referenced from the outputfile
        """ )
        sys.stdout.write( "\n" )

# run the command line interface only when executed as a script,
# not when imported as a library
if __name__ == '__main__':
    main()