"""
Htmlize python script/library
Roughly equivalent to htmlize in Emacs; this version only supports Python.
Output uses the same CSS class names, so a user can reuse the stylesheet
from an htmlize-generated HTML file and get their Emacs look.
"""
import re
import os
import sys
# Opening boilerplate of the generated page.  It is a %-format template with
# two %s slots, in order: the page title and the stylesheet href.  The <pre>
# element opened on the last line is closed by FOOTER.
HEADER = """
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<!-- Created by htmlize-1.34 in css mode. -->
<html>
<head>
<title>%s</title>
<link rel="stylesheet" href="%s" type="text/css"/>
</head>
<body>
<pre>
"""
# Closing boilerplate: ends the <pre> element opened by HEADER, then the
# body and the document.
FOOTER = """
</pre>
</body>
</html>
"""
# Lexer patterns.  Each one is anchored with "^" because the tokenizer always
# matches against the start of the text that has not been consumed yet.

# A comment: "#" up to (not including) the end of the line.
COMMENT = re.compile( r"^[#][^\n]*" )
# An identifier, optionally followed by dotted attribute access ( a.b.c ).
TOKEN = re.compile( r"^[a-zA-Z_][a-zA-Z0-9_]*([ \t]*[.][a-zA-Z_][a-zA-Z0-9_]*)*" )
# One or more whitespace characters, newlines included.
WHITESPACE = re.compile( r"^[\s\n]+" )
# String literals with an optional single u/r prefix, tried in this order:
# triple double-quoted, triple single-quoted, single-quoted, double-quoted.
STRINGS = re.compile( r'^[ur]?((["]["]["](.|\n)*?["]["]["])|([\'][\'][\'](.|\n)*?[\'][\'][\'])|([\'](([\\]\')|[^\'\n])*[\'])|(["](\\"|[^"\n])*?["]))')
# A decorator: "@" plus the whole (possibly dotted) decorator name, so that
# "@property" is one unit instead of "@p" followed by a stray "roperty".
# The second alternative keeps accepting "@" followed by any other
# non-newline character, which the previous one-character pattern allowed.
ANNOTATION = re.compile( r'^@(?:[a-zA-Z_][a-zA-Z0-9_.]*|[^\n])' )
# Runs of operator/bracket characters and digits.  ";", "~", backslash
# (line continuation) and backtick are included so that valid source using
# them is tokenized instead of being rejected as unparseable.
PUNCTUATION = re.compile( r'^([-!=+*/:%(){}><,.;~\\`]|[0-9]|[\[\]|&^])+')
# Python 2 reserved words.
KEYWORDS = ['and', 'del', 'from', 'not', 'while', 'as', 'elif',
            'global', 'or', 'with', 'assert', 'else', 'if', 'pass',
            'yield', 'break', 'except', 'import', 'print', 'class',
            'exec', 'in', 'raise', 'continue', 'finally', 'is',
            'return', 'def', 'for', 'lambda', 'try']
# Names that are not reserved words but get their own highlight class.
PSEUDO_KEYWORDS = ['None', 'True', 'False', 'self' ]
# An http/https URL, ending at the first space, "<" or line break.
URL = re.compile( r'(http(s)?://[^ <\n\r]+)' )
def span( classname, txt ):
    """
    Wrap text in an html span carrying a css class

    classname - value for the span's class attribute (inserted as-is)
    txt - text content; it is passed through html_quote before insertion

    Returns the span markup as a string
    """
    escaped = html_quote( txt )
    return "<span class='%s'>%s</span>" % ( classname, escaped )
def htmlize_urls( text ):
    """
    Convert URLs to links

    text - text to analyze

    Every non-overlapping match of the module-level URL pattern is replaced
    by an <a> element whose href and link text are both the matched URL.

    returns text with html links embedded (unchanged if it holds no URL)
    """
    def link( match ):
        # The whole match is the URL; use it as both target and label.
        url = match.group( 0 )
        return '<a href="%s">%s</a>' % ( url, url )

    # A single substitution pass replaces the manual search/slice loop.
    return URL.sub( link, text )
def html_quote( txt ):
    """
    Perform html quoting.

    txt - text to escape

    Replaces & < > " and ' with HTML character references so the text can
    be placed in element content or in a quoted attribute value.  All other
    characters are copied through unchanged.

    Returns quoted string
    """
    # The numeric reference is used for the apostrophe because &apos; is
    # not a defined entity in HTML 4.01.
    quote_map = { '&': '&amp;', '<': '&lt;', '>': '&gt;',
                  '"': '&quot;', "'": '&#39;' }
    return ''.join( [ quote_map.get( char, char ) for char in txt ] )
def annotate( buf ):
    """
    Annotate source text with how it should be syntax highlighted

    buf - source text to tokenize

    The rules below are tried in order against the start of the remaining
    text; the first one that matches consumes its match.

    Returns a list of ( tag, text ) tuples in source order.  tag is
    'comment', 'string', 'token', 'py-decorators', or None for
    punctuation/digits and whitespace.
    Raises Exception if no rule matches the remaining text.
    """
    # Order matters: e.g. the string rule must run before the token rule so
    # that a u/r prefix stays attached to its string literal.
    rules = ( ( 'comment', COMMENT ),
              ( 'string', STRINGS ),
              ( 'token', TOKEN ),
              ( 'py-decorators', ANNOTATION ),
              ( None, PUNCTUATION ),
              ( None, WHITESPACE ) )
    output = []
    lineno = 1
    while buf:
        for ( ident, regex ) in rules:
            match = regex.match( buf )
            # An empty match makes no progress, so it does not count.
            if match and match.end() > 0:
                end = match.end()
                sub = buf[ : end ]
                buf = buf[ end : ]
                output.append( ( ident, sub ) )
                lineno += sub.count( '\n' )
                break
        else:
            # No rule consumed anything; report where the lexer got stuck.
            raise Exception( "Parse error at line %d near %r"
                             % ( lineno, buf[ : 20 ] ) )
    return output
def generate_html( outs, annotations, filename, css ):
    """
    Generate HTML from annotations

    outs - output stream (only its write method is used)
    annotations - list of ( tag, text ) tuples
    filename - input filename, used as the page title
    css - css path, used as the stylesheet href

    Writes HEADER, one fragment per annotation, then FOOTER.  Tokens are
    classified in this order: keyword, pseudo keyword, builtin, function
    name (the first token after a "def" keyword), plain text.
    """
    # dir( __builtins__ ) is unreliable: __builtins__ is a module in a script
    # but a dict in an imported module, so ask the real builtins module.
    try:
        import __builtin__ as builtins_module
    except ImportError:
        import builtins as builtins_module
    # Computed once instead of once per token.
    builtin_names = frozenset( dir( builtins_module ) )

    write = outs.write
    write( HEADER % ( html_quote( filename ), html_quote( css ) ) )
    isdef = False
    for ( tag, txt ) in annotations:
        if tag is None:
            # Punctuation can contain <, > and &, so it must be escaped too.
            write( html_quote( txt ) )
        elif tag == 'comment':
            write( htmlize_urls( span( tag, txt ) ) )
        elif tag in ( 'string', 'py-decorators' ):
            write( span( tag, txt ) )
        elif tag == 'token':
            if txt in KEYWORDS:
                write( span( 'keyword', txt ) )
                if txt == "def":
                    # Keep the flag set for the next token: the function name.
                    isdef = True
                    continue
            elif txt in PSEUDO_KEYWORDS:
                write( span( 'py-pseudo-keyword', txt ) )
            elif txt in builtin_names:
                write( span( 'py-builtins', txt ) )
            elif isdef:
                write( span( 'function-name', txt ) )
            else:
                write( html_quote( txt ) )
            # Any token other than "def" itself ends the pending definition.
            isdef = False
    write( FOOTER )
def htmlize( inf, outf, css="css/python.css" ):
    """
    HTMLize a python file

    inf - input file name
    outf - output file name, or an object with a write method;
           None means stdout
    css - css path name

    A stream passed in as outf is written to but not closed; a file opened
    here from a file name is closed before returning.
    """
    try:
        string_types = basestring   # Python 2: covers str and unicode
    except NameError:
        string_types = str

    # Read and tokenize first, so a bad input file or a parse error does not
    # leave a truncated output file behind.
    with open( inf, 'r' ) as inp:
        source = inp.read()
    annotations = annotate( source )
    filename = os.path.split( inf )[1]

    if outf is None:
        outf = sys.stdout
    if isinstance( outf, string_types ):
        outs = open( outf, 'w' )
        try:
            generate_html( outs, annotations, filename, css )
        finally:
            outs.close()
    else:
        generate_html( outf, annotations, filename, css )
def main( ):
    """
    Entrypoint

    Reads the arguments from sys.argv:
    one argument - htmlize that file to stdout
    two or three - passed to htmlize as inputfile, outputfile [, cssfile]
    anything else - write the usage text to stdout
    """
    args = sys.argv[ 1 : ]
    argc = len( args )
    if argc == 1:
        htmlize( args[0], sys.stdout )
    elif argc == 2 or argc == 3:
        # Argument unpacking replaces the deprecated apply() builtin.
        htmlize( *args )
    else:
        # Written with the trailing newline a print statement would add.
        sys.stdout.write( """
htmlize.py inputfile [outputfile] [cssfile]
inputfile must be python
outputfile is html (optional) defaults to stdout
cssfile is referenced from the outputfile
""" + "\n" )


# Run the command line interface only when executed as a script.
if __name__ == '__main__':
    main()