#!/usr/bin/env python # # gen_html: An HTML generation utility # # This utility is used to generate random HTML for the parser to # deal with. Flags for each of the components allow variations # on HTML, so as to spew either valid or invalid HTML. # from string import * from ek_sgmllib import tagfind from random import randint import re, sys attrname = re.compile('[a-zA-Z_][-.a-zA-Z_0-9]*') attrvalue = re.compile('[-a-zA-Z0-9@./:+*%?!&$\(\)_#=~]') quotedattrvalueset = digits + '#$%&()*+,-./:;=?@[\]^_`{|}~' attrvalueset = '-@./:+*%?!&$()_#=~*' + letters + digits DataString = ['In A.D. 2101', 'War was beginning.', 'What happen?', 'Somebody set up us the bomb', 'We get signal', 'What!', 'Main screen turn on', "It's You!!", 'How are you gentlemen!!', 'All your base are belong to us', 'You are on the way to destruction', 'What you say!!', 'You have no chance to survive make your time', 'HA HA HA HA ....', "Take off every 'zig'", 'You know what you doing', "Move 'zig'", 'For great justice'] DataWords = split(join(DataString, ' ')) def get_word(matcher, bogus=0): while 1: res = DataWords[randint(0, len(DataWords)-1)] if not bogus: z = matcher.match(res) if not z or z.end(0) != len(res): continue return res def get_tagname(bogus=0): return upper(get_word(tagfind, bogus)) def get_whitespace(): return join(map(lambda x: whitespace[randint(0,len(whitespace) - 1)], range(randint(0, 3))), '') def get_attrname(bogus=0): return get_word(attrname, bogus) def get_quoted_attrvalue(bogus=0): quoteidx = randint(0, 1) quote = ["'", '"'][quoteidx] otherquote = ["'", '"'][not quoteidx] res = quote for i in range(randint(0, 20)): if not randint(0, 5): res = res +quotedattrvalueset[randint(0,len(quotedattrvalueset)-1)] elif not randint(0, 5): res = res + otherquote elif not randint(0, 5): res = res + get_whitespace() else: res = res + get_attrname(0) if not bogus: res = res + quote else: res = res + [otherquote, '<', '>'][randint(0, 2)] return res def get_unquoted_attrvalue(bogus=0): res = '' for i in range(randint(not bogus, 20)): if not randint(0, 5): res = res + get_tagname(bogus) elif bogus and not randint(0, 5): res = res + get_whitespace() else: res = res + attrvalueset[randint(0, len(attrvalueset)-1)] return res def get_starttag(bogus=0): res = '<' if bogus and not randint(0, 10): res = res + whitespace[randint(0, len(whitespace)-1)] res = res + get_tagname(bogus) if not randint(0, 5): res = res + get_whitespace() for nattrs in range(randint(0, 4)): res = res + whitespace[randint(0, len(whitespace)-1)] res = res + get_whitespace() res = res + get_attrname(bogus) t = randint(0, 1) if t: res = res + get_whitespace() + '=' + get_whitespace() + \ get_quoted_attrvalue(randint(0, bogus)) else: res = res + get_whitespace() + '=' + get_whitespace() + \ get_unquoted_attrvalue(randint(0, bogus)) if bogus and not randint(0, 5): res = res + get_whitespace() + '>' else: res = res + '>' return res def get_comment(): res = '