"""HTML form handling for web clients.
ClientForm is a Python module for handling HTML forms on the client
side, useful for parsing HTML forms, filling them in and returning the
completed forms to the server. It has developed from a port of Gisle
Aas' Perl module HTML::Form, from the libwww-perl library, but the
interface is not the same.
The most useful docstring is the one for HTMLForm.
RFC 1866: HTML 2.0
RFC 1867: Form-based File Upload in HTML
RFC 2388: Returning Values from Forms: multipart/form-data
HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
HTML 4.01 Specification, W3C Recommendation 24 December 1999
Copyright 2002-2003 John J. Lee
Copyright 1998-2000 Gisle Aas.
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file COPYING included with
the distribution).
"""
# XXX
# Treat unknown controls as text controls? (this was a recent LWP
# HTML::Form change) I guess this is INPUT with no TYPE? Check LWP
# source and browser behaviour.
# Support for list item ids. How to handle missing ids? (How do I deal
# with duplicate OPTION labels ATM? Can't remember...)
# Arrange things so can automatically PyPI-register with categories
# without messing up 1.5.2 compatibility.
# Tests need work.
# Test single and multiple file upload some more on the web.
# Does file upload work when name is missing? Sourceforge tracker form
# doesn't like it. Check standards, and test with Apache. Test binary
# upload with Apache.
# Add label support for CHECKBOX and RADIO.
# Better docs.
# Deal with character sets properly. Not sure what the issues are here.
# I don't *think* any encoding of control names, filenames or data is
# necessary -- HTML spec. doesn't require it, and Mozilla Firebird 0.6
# doesn't seem to do it.
# Add charset parameter to Content-type headers? How to find value??
# Get rid of MapBase, AList and MimeWriter.
# I'm not going to fix this unless somebody tells me what real servers
# that want this encoding actually expect: If enctype is
# application/x-www-form-urlencoded and there's a FILE control present.
# Strictly, it should be 'name=data' (see HTML 4.01 spec., section
# 17.13.2), but I send "name=" ATM. What about multiple file upload??
# Get rid of the two type-switches (for kind and click*).
# Remove single-selection code: can be special case of multi-selection,
# with a few variations, I think.
# Factor out multiple-selection list code? May not be easy. Maybe like
# this:
# ListControl
# ^
# | MultipleListControlMixin
# | ^
# SelectControl /
# ^ /
# \ /
# MultiSelectControl
# Plan
# ----
# Maybe a 0.2.x, cleaned up a bit and with id support for list items?
# Not sure it's worth it, really.
# Remove toggle methods.
# Replace by_label with choice between value / id / label /
# element contents (see discussion with Gisle about labels on
# libwww-perl list).
# ...what else?
# Work on DOMForm.
# XForms? Don't know if there's a need here.
# Compatibility shim: define True, False and bool() on interpreters where
# referencing them raises NameError; otherwise the existing names are kept.
try: True
except NameError:
    True = 1
    False = 0
try: bool
except NameError:
    def bool(expr):
        # Reduce any truth value to the True / False names available above.
        if expr: return True
        else: return False
import sys, urllib, urllib2, types, string, mimetools, copy
from urlparse import urljoin
from cStringIO import StringIO
# Detect unicode string support.  UnicodeType is a name in the standard
# types module (there is no module called UnicodeType), so it has to be
# imported from there; if the import fails, UNICODE is false and urlencode
# skips its unicode branch.
try:
    from types import UnicodeType
except ImportError:
    UNICODE = False
else:
    UNICODE = True
VERSION = "0.1.13"
CHUNK = 1024 # size of chunks fed to parser, in bytes
# This version of urlencode is from my Python 1.5.2 back-port of the
# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence
# of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query
    string.

    query: mapping object (anything with an items() method) or a sequence of
      (key, value) tuples
    doseq: if true, any value that is a sequence (other than a string) is
      expanded so that each of its elements becomes a separate key=element
      parameter

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Returns the parameters joined with '&', keys and values quoted with
    urllib.quote_plus.  Raises TypeError if query is neither a mapping nor a
    non-string sequence.

    """
    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items make len() raise TypeError; non-empty
            # strings fail the tuple test on their first element
            if len(query) and type(query[0]) != types.TupleType:
                raise TypeError()
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            # raise with the message only: the traceback object must not
            # end up in the exception's arguments
            raise TypeError("not a valid non-string sequence or mapping "
                            "object")

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = urllib.quote_plus(str(k))
            v = urllib.quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = urllib.quote_plus(str(k))
            if type(v) == types.StringType:
                v = urllib.quote_plus(v)
                l.append(k + '=' + v)
            elif UNICODE and type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = urllib.quote_plus(v.encode("ASCII", "replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    v = urllib.quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + urllib.quote_plus(str(elt)))
    return string.join(l, '&')
def startswith(string, initial):
    """Return true if the sequence string begins with the sequence initial."""
    prefix_len = len(initial)
    if prefix_len <= len(string):
        return string[:prefix_len] == initial
    # initial is longer than string, so it cannot be a prefix
    return False
def issequence(x):
    """Return true if x can be indexed with an integer.

    The test is x[0]: TypeError or KeyError means x is not a sequence, while
    IndexError (an empty sequence) still counts as one.

    """
    try:
        x[0]
    except IndexError:
        # indexable, there is just nothing at position 0
        return True
    except (TypeError, KeyError):
        return False
    else:
        return True
def isstringlike(x):
    """Return true if x can be concatenated with a string (x + "" works)."""
    try:
        x + ""
    except Exception:
        # Any ordinary failure of the concatenation means "not string-like".
        # Catching Exception rather than everything keeps interpreter-exit
        # requests from being swallowed here.
        return False
    else:
        return True
# XXX don't really want to drag this along (MapBase, AList, MimeWriter)
class MapBase:
    """Mapping designed to be easily derived from.

    Subclass it and override __init__, __setitem__, __getitem__, __delitem__
    and keys.  Nothing else should need to be overridden, unlike UserDict.
    This significantly simplifies dictionary-like classes.

    Also different from UserDict in that it has a readonly flag, and can be
    updated (and initialised) with a sequence of pairs (key, value).

    """
    def __init__(self, init=None):
        """init: optional mapping or sequence of (key, value) pairs."""
        self._data = {}
        self.readonly = False
        if init is not None: self.update(init)

    def __getitem__(self, key):
        return self._data[key]

    def __setitem__(self, key, item):
        """Store item under key; raises TypeError if the map is readonly."""
        if not self.readonly:
            self._data[key] = item
        else:
            raise TypeError("object doesn't support item assignment")

    def __delitem__(self, key):
        """Remove key; raises TypeError if the map is readonly."""
        if not self.readonly:
            del self._data[key]
        else:
            raise TypeError("object doesn't support item deletion")

    def keys(self):
        return self._data.keys()

    # now the internal workings, there should be no need to override these:

    def clear(self):
        for k in self.keys():
            del self[k]

    def __repr__(self):
        rep = []
        for k, v in self.items():
            rep.append("%s: %s" % (repr(k), repr(v)))
        return self.__class__.__name__+"{"+(string.join(rep, ", "))+"}"

    def copy(self):
        """Return a shallow copy (copy.copy) of this mapping."""
        return copy.copy(self)

    def __cmp__(self, dict):
        """Return 0 if every (key, value) of self is also in dict, else 1.

        NOTE: the check is one-directional -- keys that are present only in
        dict are not detected.

        """
        # note: return value is *not* boolean
        for k, v in self.items():
            if not (dict.has_key(k) and dict[k] == v):
                return 1  # different
        return 0  # the same

    def __len__(self):
        return len(self.keys())

    def values(self):
        r = []
        for k in self.keys():
            r.append(self[k])
        return r

    def items(self):
        """Return a list of (key, value) pairs, in the order of keys()."""
        # Built only from keys() and __getitem__, so subclasses that
        # override those two get a consistent items() for free.
        r = []
        for k in self.keys():
            r.append((k, self[k]))
        return r

    def has_key(self, key):
        return key in self.keys()

    def update(self, map):
        """Add the items of map, a mapping or a sequence of (key, value)
        tuples.

        Raises TypeError if an item is not a tuple.

        """
        # Anything with an items() method is treated as a mapping, even if
        # indexing it with 0 happens to work (eg. a mapping with a key 0).
        if (issequence(map) and not isstringlike(map) and
            not hasattr(map, "items")):
            items = map
        else:
            items = map.items()
        for tup in items:
            if not isinstance(tup, types.TupleType):
                raise TypeError(
                    "MapBase.update requires a map or a sequence of pairs")
            k, v = tup
            self[k] = v

    def get(self, key, failobj=None):
        if key in self.keys():
            return self[key]
        else:
            return failobj

    def setdefault(self, key, failobj=None):
        if not self.has_key(key):
            self[key] = failobj
        return self[key]
class AList(MapBase):
    """Read-only ordered mapping.

    Built from a sequence of (key, value) pairs, whose order is kept.
    set_inverted() swaps the roles of keys and values.

    """
    def __init__(self, seq=()):
        """seq: sequence of (key, value) pairs (sliced and copied, never
        modified)."""
        self.readonly = True
        self._inverted = False
        self._data = list(seq[:])
        self._keys = []
        self._values = []
        for key, value in seq:
            self._keys.append(key)
            self._values.append(value)

    def set_inverted(self, inverted):
        """Choose whether lookups go key -> value (false) or value -> key
        (true)."""
        if (inverted and not self._inverted) or (
            not inverted and self._inverted):
            self._keys, self._values = self._values, self._keys
        if inverted: self._inverted = True
        else: self._inverted = False

    def __getitem__(self, key):
        try:
            i = self._keys.index(key)
        except ValueError:
            raise KeyError(key)
        return self._values[i]

    def __delitem__(self, key):
        """Remove the pair stored under key.

        Raises TypeError while the readonly flag is set (the default), in
        the same way as MapBase, and KeyError if key is not present.

        """
        if self.readonly:
            raise TypeError("object doesn't support item deletion")
        try:
            i = self._keys.index(key)
        except ValueError:
            raise KeyError(key)
        # the three lists are parallel: keep them in step
        del self._keys[i]
        del self._values[i]
        del self._data[i]

    def keys(self): return list(self._keys[:])

    def values(self): return list(self._values[:])

    def items(self):
        """Return the (key, value) pairs in order, swapped if inverted."""
        data = self._data[:]
        if not self._inverted:
            return data
        else:
            newdata = []
            for k, v in data:
                newdata.append((v, k))
            return newdata
# This cut-n-pasted MimeWriter from standard library is here so can add
# to HTTP headers rather than message body when appropriate. It also uses
# \r\n in place of \n. This is nasty.
class MimeWriter:
    """Generic MIME writer.

    Methods:

    __init__()
    addheader()
    flushheaders()
    startbody()
    startmultipartbody()
    nextpart()
    lastpart()

    A MIME writer is much more primitive than a MIME parser.  It
    doesn't seek around on the output file, and it doesn't use large
    amounts of buffer space, so you have to write the parts in the
    order they should occur on the output file.  It does buffer the
    headers you add, allowing you to rearrange their order.

    General usage is:

    f = <writable file-like object>
    w = MimeWriter(f)
    ...call w.addheader(key, value) 0 or more times...

    followed by either:

    f = w.startbody(content_type)
    ...call f.write(data) for body data...

    or:

    w.startmultipartbody(subtype)
    for each part:
        subwriter = w.nextpart()
        ...use the subwriter's methods to create the subpart...
    w.lastpart()

    The subwriter is another MimeWriter instance, and should be
    treated in the same way as the toplevel MimeWriter.  This way,
    writing recursive body parts is easy.

    Warning: don't forget to call lastpart()!

    XXX There should be more state so calls made in the wrong order
    are detected.

    Some special cases:

    - startbody() just returns the file passed to the constructor;
      but don't use this knowledge, as it may be changed.

    - startmultipartbody() actually returns a file as well;
      this can be used to write the initial 'if you can read this your
      mailer is not MIME-aware' message.

    - If you call flushheaders(), the headers accumulated so far are
      written out (and forgotten); this is useful if you don't need a
      body part at all, e.g. for a subpart of type message/rfc822
      that's (mis)used to store some header-like information.

    - Passing a keyword argument 'prefix=' to addheader(),
      start*body() affects where the header is inserted; 0 means
      append at the end, 1 means insert at the start; default is
      append for addheader(), but insert for start*body(), which use
      it to determine where the Content-type header goes.

    - Headers added with a true add_to_http_hdrs are appended as
      (key, value) pairs to the http_hdrs list given to the constructor
      instead of being written to the file.

    """

    def __init__(self, fp, http_hdrs=None):
        self._fp = fp
        self._http_hdrs = http_hdrs
        self._headers = []
        self._boundary = []
        self._first_part = True

    def addheader(self, key, value, prefix=0,
                  add_to_http_hdrs=0):
        """
        prefix is ignored if add_to_http_hdrs is true.
        """
        pieces = string.split(value, "\r\n")
        # strip empty lines from both ends of the value
        while pieces and not pieces[-1]:
            pieces.pop()
        while pieces and not pieces[0]:
            pieces.pop(0)

        if add_to_http_hdrs:
            # HTTP headers are kept on a single line
            self._http_hdrs.append((key, string.join(pieces, "")))
            return

        # continuation lines are indented by exactly one space
        folded = pieces[:1]
        for continuation in pieces[1:]:
            folded.append(" " + string.strip(continuation))
        header = key + ": " + string.join(folded, "\r\n") + "\r\n"
        if prefix:
            self._headers.insert(0, header)
        else:
            self._headers.append(header)

    def flushheaders(self):
        """Write out the buffered headers and forget them."""
        pending = self._headers
        self._fp.writelines(pending)
        self._headers = []

    def startbody(self, ctype=None, plist=[], prefix=1,
                  add_to_http_hdrs=0, content_type=1):
        """
        prefix is ignored if add_to_http_hdrs is true.
        """
        if content_type and ctype:
            params = []
            for name, value in plist:
                params.append(';\r\n %s=\"%s\"' % (name, value))
            self.addheader("Content-type", ctype + string.join(params, ""),
                           prefix=prefix,
                           add_to_http_hdrs=add_to_http_hdrs)
        self.flushheaders()
        if not add_to_http_hdrs:
            # blank line separating the headers from the body
            self._fp.write("\r\n")
        self._first_part = True
        return self._fp

    def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1,
                           add_to_http_hdrs=0, content_type=1):
        """Start a multipart/<subtype> body and return the output file.

        A boundary is chosen with mimetools.choose_boundary() unless one is
        given.

        """
        if not boundary:
            boundary = mimetools.choose_boundary()
        self._boundary.append(boundary)
        params = [("boundary", boundary)] + plist
        return self.startbody("multipart/" + subtype,
                              params,
                              prefix=prefix,
                              add_to_http_hdrs=add_to_http_hdrs,
                              content_type=content_type)

    def nextpart(self):
        """Write the boundary line for a new part and return its writer."""
        boundary = self._boundary[-1]
        if not self._first_part:
            # terminate the previous part
            self._fp.write("\r\n")
        self._first_part = False
        self._fp.write("--" + boundary + "\r\n")
        return self.__class__(self._fp)

    def lastpart(self):
        """Write the closing boundary of the current multipart body."""
        if self._first_part:
            self.nextpart()
        closing = self._boundary.pop()
        self._fp.write("\r\n--" + closing + "--\r\n")
class ControlNotFoundError(ValueError):
    """ValueError subclass; going by its name, signals that a requested
    control could not be found (the raise sites are not in this part of the
    file)."""
class ItemNotFoundError(ValueError):
    """ValueError subclass; going by its name, signals that a requested list
    item could not be found (the raise sites are not in this part of the
    file)."""
class ItemCountError(ValueError):
    """ValueError subclass; going by its name, signals a wrong number of
    items (the raise sites are not in this part of the file)."""
class ParseError(Exception):
    """Raised by the form parser when the HTML confuses it, unless errors
    are being ignored."""
def ParseResponse(response, select_default=False, ignore_errors=False):
    """Parse HTTP response and return a list of HTMLForm instances.

    The return value of urllib2.urlopen can be conveniently passed to this
    function as the response parameter.

    ClientForm.ParseError is raised on parse errors.

    response: file-like object (supporting read() method) with a method
     geturl(), returning the base URI of the HTTP response
    select_default: for multiple-selection SELECT controls and RADIO controls,
     pick the first item as the default if none are selected in the HTML
    ignore_errors: don't raise ParseError, and carry on regardless if the
     parser gets confused

    Pass a true value for select_default if you want the behaviour specified by
    RFC 1866 (the HTML 2.0 standard), which is to select the first item in a
    RADIO or multiple-selection SELECT control if none were selected in the
    HTML.  Most browsers (including Microsoft Internet Explorer (IE) and
    Netscape Navigator) instead leave all items unselected in these cases.  The
    W3C HTML 4.0 standard leaves this behaviour undefined in the case of
    multiple-selection SELECT controls, but insists that at least one RADIO
    button should be checked at all times, in contradiction to browser
    behaviour.

    Precisely what ignore_errors does isn't well-defined yet, so don't rely too
    much on the current behaviour -- if you want robustness, you're better off
    fixing the HTML before passing it to this function.

    """
    # ignore_errors has to be handed on explicitly, otherwise ParseFile
    # falls back to its own default and the caller's choice is lost
    return ParseFile(response, response.geturl(), select_default,
                     ignore_errors)
def ParseFile(file, base_uri, select_default=False, ignore_errors=False):
    """Parse HTML and return a list of HTMLForm instances.

    ClientForm.ParseError is raised on parse errors.

    file: file-like object (supporting read() method) containing HTML with zero
     or more forms to be parsed
    base_uri: the base URI of the document

    For the other arguments and further details, see ParseResponse.__doc__.

    """
    fp = _FORM_PARSER_CLASS(ignore_errors)
    while 1:
        data = file.read(CHUNK)
        # Only an empty read means end of input: a file-like object may
        # hand back fewer than CHUNK bytes while more data is still to come.
        if not data:
            break
        fp.feed(data)

    forms = []
    for (name, action, method, enctype), attrs, controls in fp.forms:
        # a missing ACTION means "submit to the document's own URI";
        # otherwise resolve it relative to the base URI
        if action is None:
            action = base_uri
        else:
            action = urljoin(base_uri, action)
        form = HTMLForm(action, method, enctype, name, attrs)
        for control_type, control_name, control_attrs in controls:
            form.new_control(control_type, control_name, control_attrs,
                             select_default=select_default)
        forms.append(form)
    # controls are only fully initialised once fixup has been called
    for form in forms:
        form.fixup()
    return forms
class _AbstractFormParser:
"""forms attribute contains HTMLForm instances on completion."""
# pinched (and modified) from Moshe Zadka
def __init__(self, ignore_errors, entitydefs=None):
if entitydefs is not None:
self.entitydefs = entitydefs
self._ignore_errors = ignore_errors
self.forms = []
self._current_form = None
self._select = None
self._optgroup = None
self._option = None
self._textarea = None
def error(self, error):
if not self._ignore_errors: raise error
def start_form(self, attrs):
if self._current_form is not None:
self.error(ParseError("nested FORMs"))
name = None
action = None
enctype = "application/x-www-form-urlencoded"
method = "GET"
d = {}
for key, value in attrs:
if key == "name":
name = value
elif key == "action":
action = value
elif key == "method":
method = string.upper(value)
elif key == "enctype":
enctype = string.lower(value)
else:
d[key] = value
controls = []
self._current_form = (name, action, method, enctype), d, controls
def end_form(self):
if self._current_form is None:
self.error(ParseError("end of FORM before start"))
self.forms.append(self._current_form)
self._current_form = None
def start_select(self, attrs):
if self._current_form is None:
self.error(ParseError("start of SELECT before start of FORM"))
if self._select is not None:
self.error(ParseError("nested SELECTs"))
if self._textarea is not None:
self.error(ParseError("SELECT inside TEXTAREA"))
d = {}
for key, val in attrs:
d[key] = val
self._select = d
self._append_select_control({"__select": d})
def end_select(self):
if self._current_form is None:
self.error(ParseError("end of SELECT before start of FORM"))
if self._select is None:
self.error(ParseError("end of SELECT before start"))
if self._option is not None:
self._end_option()
self._select = None
def start_optgroup(self, attrs):
if self._select is None:
self.error(ParseError("OPTGROUP outside of SELECT"))
d = {}
for key, val in attrs:
d[key] = val
self._optgroup = d
def end_optgroup(self):
if self._optgroup is None:
self.error(ParseError("end of OPTGROUP before start"))
self._optgroup = None
def _start_option(self, attrs):
if self._select is None:
self.error(ParseError("OPTION outside of SELECT"))
if self._option is not None:
self._end_option()
d = {}
for key, val in attrs:
d[key] = val
self._option = {}
self._option.update(d)
if (self._optgroup and self._optgroup.has_key("disabled") and
not self._option.has_key("disabled")):
self._option["disabled"] = None
def _end_option(self):
if self._option is None:
self.error(ParseError("end of OPTION before start"))
contents = string.strip(self._option.get("contents", ""))
#contents = string.strip(self._option["contents"])
self._option["contents"] = contents
if not self._option.has_key("value"):
self._option["value"] = contents
if not self._option.has_key("label"):
self._option["label"] = contents
# stuff dict of SELECT HTML attrs into a special private key
# (gets deleted again later)
self._option["__select"] = self._select
self._append_select_control(self._option)
self._option = None
def _append_select_control(self, attrs):
controls = self._current_form[2]
name = self._select.get("name")
controls.append(("select", name, attrs))
## def do_option(self, attrs):
## if self._select is None:
## self.error(ParseError("OPTION outside of SELECT"))
## d = {}
## for key, val in attrs:
## d[key] = val
## self._option = {}
## self._option.update(d)
## if (self._optgroup and self._optgroup.has_key("disabled") and
## not self._option.has_key("disabled")):
## self._option["disabled"] = None
def start_textarea(self, attrs):
if self._current_form is None:
self.error(ParseError("start of TEXTAREA before start of FORM"))
if self._textarea is not None:
self.error(ParseError("nested TEXTAREAs"))
if self._select is not None:
self.error(ParseError("TEXTAREA inside SELECT"))
d = {}
for key, val in attrs:
d[key] = val
self._textarea = d
def end_textarea(self):
if self._current_form is None:
self.error(ParseError("end of TEXTAREA before start of FORM"))
if self._textarea is None:
self.error(ParseError("end of TEXTAREA before start"))
controls = self._current_form[2]
name = self._textarea.get("name")
controls.append(("textarea", name, self._textarea))
self._textarea = None
def handle_data(self, data):
if self._option is not None:
# self._option is a dictionary of the OPTION element's HTML
# attributes, but it has two special keys, one of which is the
# special "contents" key contains text between OPTION tags (the
# other is the "__select" key: see the end_option method)
map = self._option
key = "contents"
elif self._textarea is not None:
map = self._textarea
key = "value"
else:
return
if not map.has_key(key):
map[key] = data
else:
map[key] = map[key] + data
## def handle_data(self, data):
## if self._option is not None:
## contents = string.strip(data)
## controls = self._current_form[2]
## if not self._option.has_key("value"):
## self._option["value"] = contents
## if not self._option.has_key("label"):
## self._option["label"] = contents
## # self._option is a dictionary of the OPTION element's HTML
## # attributes, but it has two special keys:
## # 1. special "contents" key contains text between OPTION tags
## self._option["contents"] = contents
## # 2. stuff dict of SELECT HTML attrs into a special private key
## # (gets deleted again later)
## self._option["__select"] = self._select
## self._append_select_control(self._option)
## self._option = None
## elif self._textarea is not None:
## #self._textarea["value"] = data
## if self._textarea.get("value") is None:
## self._textarea["value"] = data
## else:
## self._textarea["value"] = self._textarea["value"] + data
def do_button(self, attrs):
if self._current_form is None:
self.error(ParseError("start of BUTTON before start of FORM"))
d = {}
d["type"] = "submit" # default
for key, val in attrs:
d[key] = val
controls = self._current_form[2]
type = d["type"]
name = d.get("name")
# we don't want to lose information, so use a type string that
# doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
# eg. type for BUTTON/RESET is "resetbutton"
# (type for INPUT/RESET is "reset")
type = type+"button"
controls.append((type, name, d))
def do_input(self, attrs):
if self._current_form is None:
self.error(ParseError("start of INPUT before start of FORM"))
d = {}
d["type"] = "text" # default
for key, val in attrs:
d[key] = val
controls = self._current_form[2]
type = d["type"]
name = d.get("name")
controls.append((type, name, d))
def do_isindex(self, attrs):
if self._current_form is None:
self.error(ParseError("start of ISINDEX before start of FORM"))
d = {}
for key, val in attrs:
d[key] = val
controls = self._current_form[2]
# isindex doesn't have type or name HTML attributes
controls.append(("isindex", None, d))
# Pick the parser implementation: HTMLParser when it can be imported (it
# copes with XHTML), falling back to htmllib otherwise.
try:
    import HTMLParser
except ImportError:
    import htmllib
    import formatter

    class _FormParser(_AbstractFormParser, htmllib.HTMLParser):
        """Form parser built on htmllib."""
        # This is still here for compatibility with Python 1.5.2.
        # It doesn't do the right thing with XHTML.
        def __init__(self, ignore_errors, entitydefs=None):
            htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
            _AbstractFormParser.__init__(self, ignore_errors, entitydefs)

        def do_option(self, attrs):
            _AbstractFormParser._start_option(self, attrs)

    _FORM_PARSER_CLASS = _FormParser
else:
    class _XHTMLCompatibleFormParser(_AbstractFormParser,
                                     HTMLParser.HTMLParser):
        """Form parser built on HTMLParser.

        Tags are dispatched to start_<tag> / do_<tag> / end_<tag> methods;
        tags without a matching method are ignored.

        """
        # thanks to Michael Howitz for this!
        def __init__(self, ignore_errors, entitydefs=None):
            HTMLParser.HTMLParser.__init__(self)
            _AbstractFormParser.__init__(self, ignore_errors, entitydefs)

        def start_option(self, attrs):
            _AbstractFormParser._start_option(self, attrs)

        def end_option(self):
            _AbstractFormParser._end_option(self)

        def handle_starttag(self, tag, attrs):
            # start_<tag> takes precedence over do_<tag>
            for prefix in ('start_', 'do_'):
                try:
                    handler = getattr(self, prefix + tag)
                except AttributeError:
                    continue
                handler(attrs)
                return
            # unknown tag: nothing to do

        def handle_endtag(self, tag):
            try:
                handler = getattr(self, 'end_' + tag)
            except AttributeError:
                return  # unknown tag
            handler()

        # handle_charref, handle_entityref and default entitydefs are taken
        # from sgmllib
        def handle_charref(self, name):
            try:
                code = int(name)
            except ValueError:
                # not a decimal number: force the "unknown" branch below
                code = -1
            if 0 <= code <= 255:
                self.handle_data(chr(code))
            else:
                self.unknown_charref(name)

        # Definition of entities -- derived classes may override
        entitydefs = \
                {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

        def handle_entityref(self, name):
            if name not in self.entitydefs:
                self.unknown_entityref(name)
                return
            self.handle_data(self.entitydefs[name])

        # These methods would have passed through the ref intact if I'd thought
        # of it earlier, but since the old parser silently swallows unknown
        # refs, so does this new parser.
        def unknown_entityref(self, ref): pass
        def unknown_charref(self, ref): pass

    _FORM_PARSER_CLASS = _XHTMLCompatibleFormParser
class Control:
    """An HTML form control.

    An HTMLForm contains a sequence of Controls.  HTMLForm delegates lots of
    things to Control objects, and most of Control's methods are, in effect,
    documented by the HTMLForm docstrings.

    The Controls in an HTMLForm can be got at via the HTMLForm.find_control
    method or the HTMLForm.controls attribute.

    Control instances are usually constructed using the ParseFile /
    ParseResponse functions, so you can probably ignore the rest of this
    paragraph.  A Control is only properly initialised after the fixup method
    has been called.  In fact, this is only strictly necessary for ListControl
    instances.  This is necessary because ListControls are built up from
    ListControls each containing only a single item, and their initial value(s)
    can only be known after the sequence is complete.

    The types and values that are acceptable for assignment to the value
    attribute are defined by subclasses.

    If the disabled attribute is true, this represents the state typically
    represented by browsers by `greying out' a control.  If the disabled
    attribute is true, the Control will raise AttributeError if an attempt is
    made to change its value.  In addition, the control will not be considered
    `successful' as defined by the W3C HTML 4 standard -- ie. it will
    contribute no data to the return value of the HTMLForm.click* methods.  To
    enable a control, set the disabled attribute to a false value.

    If the readonly attribute is true, the Control will raise AttributeError if
    an attempt is made to change its value.  To make a control writable, set
    the readonly attribute to a false value.

    All controls have the disabled and readonly attributes, not only those that
    may have the HTML attributes of the same names.

    On assignment to the value attribute, the following exceptions are raised:
    TypeError, AttributeError (if the value attribute should not be assigned
    to, because the control is disabled, for example) and ValueError.

    If the name or value attributes are None, or the value is an empty list, or
    if the control is disabled, the control is not successful.

    Public attributes:

    type: string describing type of control (see the keys of the
     HTMLForm.type2class dictionary for the allowable values) (readonly)
    name: name of control (readonly)
    value: current value of control (subclasses may allow a single value, a
     sequence of values, or either)
    disabled: disabled state
    readonly: readonly state
    id: value of id HTML attribute

    """
    def __init__(self, type, name, attrs):
        """
        type: string describing type of control (see the keys of the
         HTMLForm.type2class dictionary for the allowable values)
        name: control name
        attrs: HTML attributes of control's HTML element

        Must be overridden: this base implementation raises
        NotImplementedError.

        """
        raise NotImplementedError()

    def add_to_form(self, form):
        """Append this control to form.controls."""
        form.controls.append(self)

    def fixup(self):
        """Finish initialisation; a no-op in this base class."""
        pass

    # attribute access is defined by the subclasses
    def __getattr__(self, name): raise NotImplementedError()
    def __setattr__(self, name, value): raise NotImplementedError()

    def pairs(self):
        """Return list of (key, value) pairs suitable for passing to urlencode.
        """
        raise NotImplementedError()

    def _write_mime_data(self, mw):
        """Write data for this control to a MimeWriter."""
        # called by HTMLForm
        # one MIME part per (name, value) pair
        for name, value in self.pairs():
            part = mw.nextpart()
            disposition = 'form-data; name="%s"' % name
            part.addheader("Content-disposition", disposition, 1)
            body = part.startbody(prefix=0)
            body.write(value)

    def __str__(self):
        raise NotImplementedError()
#---------------------------------------------------
class ScalarControl(Control):
    """Control whose value is not restricted to one of a prescribed set.

    Some ScalarControls don't accept any value attribute.  Otherwise, takes a
    single value, which must be string-like.

    Additional read-only public attribute:

    attrs: dictionary mapping the names of original HTML attributes of the
     control to their values

    """
    def __init__(self, type, name, attrs):
        # type and name are read-only through __setattr__, so they are
        # stored directly in the instance dictionary
        self.__dict__["name"] = name
        self.__dict__["type"] = string.lower(type)
        self._value = attrs.get("value")
        self.id = attrs.get("id")
        self.disabled = attrs.has_key("disabled")
        self.readonly = attrs.has_key("readonly")
        self.attrs = attrs.copy()
        self._clicked = False

    def __getattr__(self, name):
        # only reached for names that normal lookup did not find
        if name != "value":
            raise AttributeError("%s instance has no attribute '%s'" %
                                 (self.__class__.__name__, name))
        return self.__dict__["_value"]

    def __setattr__(self, name, value):
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        if name != "value":
            self.__dict__[name] = value
            return
        # assignment to value: validate before storing
        if not isstringlike(value):
            raise TypeError("must assign a string")
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        if self.disabled:
            raise AttributeError("control '%s' is disabled" % self.name)
        self.__dict__["_value"] = value

    def pairs(self):
        """Return [(name, value)], or [] if the control is not successful."""
        name = self.name
        value = self.value
        if not (name is None or value is None or self.disabled):
            return [(name, value)]
        return []

    def __str__(self):
        name = self.name
        if name is None:
            name = ""
        value = self.value
        if value is None:
            value = ""

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        info = string.join(flags, ", ")
        if info:
            info = " (%s)" % info

        return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)
#---------------------------------------------------
class TextControl(ScalarControl):
    """Textual input control.

    Covers:

    INPUT/TEXT
    INPUT/PASSWORD
    INPUT/FILE
    INPUT/HIDDEN
    TEXTAREA

    """
    def __init__(self, type, name, attrs):
        ScalarControl.__init__(self, type, name, attrs)
        # a text control with no VALUE attribute starts out empty
        if self._value is None:
            self._value = ""
        # hidden fields can't be changed
        if self.type == "hidden":
            self.readonly = True
#---------------------------------------------------
class FileControl(ScalarControl):
"""File upload with INPUT TYPE=FILE.
The value attribute of a FileControl is always None.
Additional public method: add_file
"""
def __init__(self, type, name, attrs):
ScalarControl.__init__(self, type, name, attrs)
self._value = None
self._upload_data = []
def __setattr__(self, name, value):
if name in ("value", "name", "type"):
raise AttributeError("%s attribute is readonly" % name)
else:
self.__dict__[name] = value
def add_file(self, file_object, content_type=None, filename=None):
if not hasattr(file_object, "read"):
raise TypeError("file-like object must have read method")
if content_type is not None and not isstringlike(content_type):
raise TypeError("content type must be None or string-like")
if filename is not None and not isstringlike(filename):
raise TypeError("filename must be None or string-like")
if content_type is None:
content_type = "application/octet-stream"
self._upload_data.append((file_object, content_type, filename))
def pairs(self):
# XXX should it be successful even if unnamed?
if self.name is None or self.disabled:
return []
return [(self.name, "")]
def _write_mime_data(self, mw):
# called by HTMLForm
if len(self._upload_data) == 1:
# single file
file_object, content_type, filename = self._upload_data[0]
mw2 = mw.nextpart()
fn_part = filename and ('; filename="%s"' % filename) or ''
disp = 'form-data; name="%s"%s' % (self.name, fn_part)
mw2.addheader("Content-disposition", disp, prefix=1)
fh = mw2.startbody(content_type, prefix=0)
fh.write(file_object.read())
elif len(self._upload_data) != 0:
# multiple files
mw2 = mw.nextpart()
disp = 'form-data; name="%s"' % self.name
mw2.addheader("Content-disposition", disp, prefix=1)
fh = mw2.startmultipartbody("mixed", prefix=0)
for file_object, content_type, filename in self._upload_data:
mw3 = mw2.nextpart()
fn_part = filename and ('; filename="%s"' % filename) or ''
disp = 'file%s' % fn_part
mw3.addheader("Content-disposition", disp, prefix=1)
fh2 = mw3.startbody(content_type, prefix=0)
fh2.write(file_object.read())
mw2.lastpart()
def __str__(self):
    """Return "<ClassName(name=file1, file2)>" plus any state flags.

    Files queued without a filename are shown as empty strings.
    """
    name = self.name
    if name is None:
        name = ""

    # joining an empty list yields "", which covers the no-uploads case
    filenames = []
    for upload in self._upload_data:
        filename = upload[2]
        if filename is None:
            filename = ""
        filenames.append(filename)
    value = string.join(filenames, ", ")

    flags = []
    if self.disabled:
        flags.append("disabled")
    if self.readonly:
        flags.append("readonly")
    info = string.join(flags, ", ")
    if info:
        info = " (%s)" % info

    return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)
#---------------------------------------------------
class IsindexControl(ScalarControl):
    """ISINDEX control.

    ISINDEX is the odd-one-out of HTML form controls.  In fact, it isn't
    really part of regular HTML forms at all, and predates them.  You're
    only allowed one ISINDEX per HTML document.  ISINDEX and regular form
    submission are mutually exclusive -- either submit a form, or the
    ISINDEX.

    Having said this, since ISINDEX controls may appear in forms (which is
    probably bad HTML), ParseFile / ParseResponse will include them in the
    HTMLForm instances they return.  You can set the ISINDEX's value, as
    with any other control (but note that ISINDEX controls have no name, so
    you'll need to use the type argument of set_value!).  When you submit
    the form, the ISINDEX will not be successful (ie., no data will get
    returned to the server as a result of its presence), unless you click
    on the ISINDEX control, in which case the ISINDEX gets submitted
    instead of the form:

    form.set_value("my isindex value", type="isindex")
    urllib2.urlopen(form.click(type="isindex"))

    ISINDEX elements outside of FORMs are ignored.  If you want to submit
    one by hand, do it like so:

    url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value"))
    result = urllib2.urlopen(url)

    """

    def __init__(self, type, name, attrs):
        """Initialise as a ScalarControl, defaulting a missing value to ""."""
        ScalarControl.__init__(self, type, name, attrs)
        if self._value is None:
            self._value = ""

    def pairs(self):
        """Return no pairs: an ISINDEX never contributes form data."""
        return []

    def _click(self, form, coord, return_type):
        """Build the ISINDEX submission for this control.

        form: the form holding this control; only form.action is used
        coord: unused
        return_type: "pairs" gives an empty list, "request_data" gives a
         (url, None, []) tuple, anything else gives a urllib2.Request
        """
        # Relative URL for ISINDEX submission: instead of "foo=bar+baz",
        # want "bar+baz".
        # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is
        # deprecated in 4.01, but it should still say how to submit it).
        # Submission of ISINDEX is explained in the HTML 3.2 spec, though.
        query = "?" + urllib.quote_plus(self.value)
        url = urljoin(form.action, query)
        if return_type == "pairs":
            return []
        if return_type == "request_data":
            return url, None, []
        return urllib2.Request(url)

    def __str__(self):
        """Return "<ClassName(value)>" plus any disabled/readonly flags."""
        shown = self.value
        if shown is None:
            shown = ""

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        suffix = string.join(flags, ", ")
        if suffix:
            suffix = " (%s)" % suffix

        return "<%s(%s)%s>" % (self.__class__.__name__, shown, suffix)
#---------------------------------------------------
class IgnoreControl(ScalarControl):
    """Control that we're not interested in.

    Covers:

    INPUT/RESET
    BUTTON/RESET
    INPUT/BUTTON
    BUTTON/BUTTON

    These controls are always unsuccessful, in the terminology of HTML 4
    (ie. they never require any information to be returned to the server).

    BUTTON/BUTTON is used to generate events for script embedded in HTML.

    The value attribute of IgnoreControl is always None.

    """

    def __init__(self, type, name, attrs):
        """Initialise as a ScalarControl, then force the value to None."""
        ScalarControl.__init__(self, type, name, attrs)
        self._value = None

    def __setattr__(self, name, value):
        """Store an attribute; value, name and type are read-only.

        Raises AttributeError on an attempt to assign any of those three.
        """
        if name == "value":
            raise AttributeError(
                "control '%s' is ignored, hence read-only" % self.name)
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        self.__dict__[name] = value
#---------------------------------------------------
class ListControl(Control):
    """Control representing a sequence of items.

    The value attribute of a ListControl represents the selected list items
    in the control.

    ListControl implements both list controls that take a single value and
    those that take multiple values.

    ListControls accept sequence values only.  Some controls only accept
    sequences of length 0 or 1 (RADIO, and single-selection SELECT).
    In those cases, ItemCountError is raised if len(sequence) > 1.
    CHECKBOXes and multiple-selection SELECTs (those having the "multiple"
    HTML attribute) accept sequences of any length.

    Note the following mistake:

    control.value = some_value
    assert control.value == some_value    # not necessarily true

    The reason for this is that the value attribute always gives the list
    items in the order they were listed in the HTML.

    ListControl items can also be referred to by their labels instead of
    names.  Use the by_label argument, and the set_value_by_label,
    get_value_by_label methods.

    XXX RadioControl and CheckboxControl don't implement by_label yet.

    Note that, rather confusingly, though SELECT controls are represented in
    HTML by SELECT elements (which contain OPTION elements, representing
    individual list items), CHECKBOXes and RADIOs are not represented by
    *any* element.  Instead, those controls are represented by a collection
    of INPUT elements.  For example, this is a SELECT control, named
    "control1":

    <select name="control1">
     <option>foo</option>
     <option value="1">bar</option>
    </select>

    and this is a CHECKBOX control, named "control2":

    <input type="checkbox" name="control2" value="foo" id="cbe1">
    <input type="checkbox" name="control2" value="bar" id="cbe2">

    The id attribute of a CHECKBOX or RADIO ListControl is always that of
    its first element (for example, "cbe1" above).

    Additional read-only public attribute: multiple.

    ListControls are built up by the parser from their component items by
    creating one ListControl per item, consolidating them into a single
    master ListControl held by the HTMLForm:

    -User calls form.new_control(...)
    -Form creates Control, and calls control.add_to_form(self).
    -Control looks for a Control with the same name and type in the form,
     and if it finds one, merges itself with that control by calling
     control.merge_control(self).  The first Control added to the form, of
     a particular name and type, is the only one that survives in the
     form.
    -Form calls control.fixup for all its controls.  ListControls in the
     form know they can now safely pick their default values.

    To create a ListControl without an HTMLForm, use:

    control.merge_control(new_control)

    """

    def __init__(self, type, name, attrs={}, select_default=False,
                 called_as_base_class=False):
        """
        type: control type; stored lower-cased
        name: control name
        attrs: HTML attributes of the (single) list item this instance
         starts out with; the dictionary is copied, never modified
        select_default: for RADIO and multiple-selection SELECT controls,
         pick the first item as the default if no 'selected' HTML attribute
         is present
        called_as_base_class: must be true; ListControl is abstract, and
         NotImplementedError is raised otherwise

        Subclasses are responsible for setting multiple, _menu and
        _selected.

        """
        if not called_as_base_class:
            raise NotImplementedError()

        # name and type are read-only through __setattr__, so write them
        # straight into the instance dictionary
        self.__dict__["type"] = string.lower(type)
        self.__dict__["name"] = name
        self._value = attrs.get("value")
        self.disabled = False
        self.readonly = False
        self.id = attrs.get("id")

        self._attrs = attrs.copy()
        # Each merged-in Control instance corresponds to a single list
        # item: see ListControl.__doc__.  The two lists below run parallel
        # to self._menu, one entry per item.
        if attrs:
            self._attrs_list = [self._attrs]  # extended by .merge_control()
            self._disabled_list = [self._attrs.has_key("disabled")]  # ditto
        else:
            self._attrs_list = []  # extended by .merge_control()
            self._disabled_list = []  # ditto

        self._select_default = select_default
        self._clicked = False
        # Some list controls can have their default set only after all
        # items are known.  If so, self._value_is_set is false, and the
        # self.fixup method, called after all items have been added, sets
        # the default.
        self._value_is_set = False

    def _value_from_label(self, label):
        """Translate an item label into an item name (subclass hook)."""
        raise NotImplementedError("control '%s' does not yet support "
                                  "by_label" % self.name)

    def _item_index(self, name):
        """Return the position in the menu of the item named name.

        Raises ItemNotFoundError if the control has no such item.
        """
        try:
            return self._menu.index(name)
        except ValueError:
            raise ItemNotFoundError("no item named '%s'" % (name,))

    def toggle(self, name, by_label=False):
        """Flip the selected state of the named item."""
        return self._set_selected_state(name, 2, by_label)

    def set(self, selected, name, by_label=False):
        """Select (selected true) or deselect the named item."""
        action = int(bool(selected))
        return self._set_selected_state(name, action, by_label)

    def _set_selected_state(self, name, action, by_label):
        """
        name: item name (or item label, if by_label is true)
        action:
         0: clear
         1: set
         2: toggle

        Raises TypeError if name is not string-like, AttributeError if the
        control is disabled or readonly or the item to be selected is
        disabled, and ItemNotFoundError if there is no such item.

        """
        if not isstringlike(name):
            raise TypeError("item name must be string-like")
        if self.disabled:
            raise AttributeError("control '%s' is disabled" % self.name)
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        if by_label:
            name = self._value_from_label(name)
        i = self._item_index(name)

        if self.multiple:
            if action == 2:
                action = not self._selected[i]
            # clearing a disabled item is allowed, selecting one is not
            if action and self._disabled_list[i]:
                raise AttributeError("item '%s' is disabled" % name)
            self._selected[i] = bool(action)
        else:
            if action == 2:
                if self._selected == name:
                    action = 0
                else:
                    action = 1
            if action == 0 and self._selected == name:
                self._selected = None
            elif action == 1:
                if self._disabled_list[i]:
                    raise AttributeError("item '%s' is disabled" % name)
                self._selected = name

    def toggle_single(self, by_label=False):
        """Flip the selected state of the only item of the control."""
        self._set_single_selected_state(2, by_label)

    def set_single(self, selected, by_label=False):
        """Select or deselect the only item of the control."""
        action = int(bool(selected))
        self._set_single_selected_state(action, by_label)

    def _set_single_selected_state(self, action, by_label):
        """Apply action (see _set_selected_state) to the control's sole item.

        by_label is accepted for interface compatibility but has no effect:
        the item is identified by position, not by name or label.

        Raises ItemCountError unless the control has exactly one item.

        """
        if len(self._menu) != 1:
            raise ItemCountError("'%s' is not a single-item control" %
                                 self.name)

        # The menu holds item names, never labels, so the name must not be
        # pushed through label lookup (doing so broke by_label=True).
        name = self._menu[0]
        self._set_selected_state(name, action, False)

    def get_item_disabled(self, name, by_label=False):
        """Get disabled state of named list item in a ListControl."""
        if by_label:
            name = self._value_from_label(name)
        return self._disabled_list[self._item_index(name)]

    def set_item_disabled(self, disabled, name, by_label=False):
        """Set disabled state of named list item in a ListControl.

        disabled: boolean disabled state

        """
        if by_label:
            name = self._value_from_label(name)
        self._disabled_list[self._item_index(name)] = bool(disabled)

    def set_all_items_disabled(self, disabled):
        """Set disabled state of all list items in a ListControl.

        disabled: boolean disabled state

        """
        for i in range(len(self._disabled_list)):
            self._disabled_list[i] = bool(disabled)

    def get_item_attrs(self, name, by_label=False):
        """Return dictionary of HTML attributes for a single ListControl item.

        The HTML element types that describe list items are: OPTION for
        SELECT controls, INPUT for the rest.  These elements have HTML
        attributes that you may occasionally want to know about -- for
        example, the "alt" HTML attribute gives a text string describing
        the item (graphical browsers usually display this as a tooltip).

        The returned dictionary maps HTML attribute names to values.  The
        names and values are taken from the original HTML.

        Note that for SELECT controls, the returned dictionary contains a
        special key "contents" -- see SelectControl.__doc__.

        Raises ItemNotFoundError if there is no such item.

        """
        if by_label:
            name = self._value_from_label(name)
        return self._attrs_list[self._item_index(name)]

    def add_to_form(self, form):
        """Add this control to form, or merge it into an existing control.

        If the form already holds a control of the same name and type, this
        control's items are merged into it; otherwise the control itself is
        added.

        """
        try:
            control = form.find_control(self.name, self.type)
        except ControlNotFoundError:
            Control.add_to_form(self, form)
        else:
            control.merge_control(self)

    def merge_control(self, control):
        """Absorb the items (and any explicit selection) of control."""
        assert bool(control.multiple) == bool(self.multiple)
        assert isinstance(control, self.__class__)
        self._menu.extend(control._menu)
        self._attrs_list.extend(control._attrs_list)
        self._disabled_list.extend(control._disabled_list)
        if control.multiple:
            self._selected.extend(control._selected)
        else:
            # single selection: the most recently merged explicit
            # selection wins
            if control._value_is_set:
                self._selected = control._selected
        if control._value_is_set:
            self._value_is_set = True

    def fixup(self):
        """
        ListControls are built up from component list items (which are also
        ListControls) during parsing.  This method should be called after
        all items have been added.  See ListControl.__doc__ for the reason
        this is required.

        """
        # Need to set default selection where no item was indicated as
        # being selected by the HTML:

        # CHECKBOX:
        #  Nothing should be selected.
        # SELECT/single, SELECT/multiple and RADIO:
        #  RFC 1866 (HTML 2.0): says first item should be selected.
        #  W3C HTML 4.01 Specification: says that client behaviour is
        #   undefined in this case.  For RADIO, exactly one must be
        #   selected, though which one is undefined.
        #  Both Netscape and Microsoft Internet Explorer (IE) choose first
        #   item for SELECT/single.  However, both IE5 and Mozilla (both 1.0
        #   and Firebird 0.6) leave all items unselected for RADIO and
        #   SELECT/multiple.

        # Since both Netscape and IE choose the first item for
        # SELECT/single, we do the same.  OTOH, both Netscape and IE
        # leave SELECT/multiple with nothing selected, in violation of RFC
        # 1866 (but not in violation of the W3C HTML 4 standard); the same
        # is true of RADIO (which *is* in violation of the HTML 4
        # standard).  We follow RFC 1866 if the select_default attribute is
        # set, and Netscape and IE otherwise.  RFC 1866 and HTML 4 are
        # always violated insofar as you can deselect all items in a
        # RadioControl.

        raise NotImplementedError()

    def __getattr__(self, name):
        """Compute the value attribute: list of selected item names."""
        if name == "value":
            menu = self._menu
            if self.multiple:
                values = []
                for i in range(len(menu)):
                    if self._selected[i]: values.append(menu[i])
                return values
            else:
                if self._selected is None: return []
                else: return [self._selected]
        else:
            raise AttributeError("%s instance has no attribute '%s'" %
                                 (self.__class__.__name__, name))

    def __setattr__(self, name, value):
        """Set an attribute; value is validated, name/type/multiple are
        read-only.

        Raises AttributeError if the control is disabled or readonly and
        value is assigned, or if a read-only attribute is assigned.

        """
        if name == "value":
            if self.disabled:
                raise AttributeError("control '%s' is disabled" % self.name)
            if self.readonly:
                raise AttributeError("control '%s' is readonly" % self.name)
            self._set_value(value)
        elif name in ("name", "type", "multiple"):
            raise AttributeError("%s attribute is readonly" % name)
        else:
            self.__dict__[name] = value

    def _set_value(self, value):
        """Dispatch to the single- or multiple-selection setter."""
        if self.multiple:
            self._multiple_set_value(value)
        else:
            self._single_set_value(value)

    def _single_set_value(self, value):
        """Select at most one item, given a sequence of length 0 or 1.

        Raises TypeError for None or a string, ItemCountError for longer
        sequences, ItemNotFoundError for an unknown item and AttributeError
        for a disabled item.

        """
        if value is None or isstringlike(value):
            raise TypeError("ListControl, must set a sequence")
        nr = len(value)
        if not (0 <= nr <= 1):
            raise ItemCountError("single selection list, must set sequence of "
                                 "length 0 or 1")

        if nr == 0:
            self._selected = None
        else:
            value = value[0]
            try:
                i = self._menu.index(value)
            except ValueError:
                raise ItemNotFoundError("no item named '%s'" %
                                        repr(value))
            if self._disabled_list[i]:
                raise AttributeError("item '%s' is disabled" % value)
            self._selected = value

    def _multiple_set_value(self, value):
        """Select exactly the items named in the sequence value.

        The selection is only replaced once every name has been checked, so
        a failure leaves the previous selection untouched.

        Raises TypeError for None or a string, ItemNotFoundError for an
        unknown item and AttributeError for a disabled item.

        """
        if value is None or isstringlike(value):
            raise TypeError("ListControl, must set a sequence")

        selected = [False]*len(self._selected)
        menu = self._menu
        disabled_list = self._disabled_list

        for v in value:
            found = False
            for i in range(len(menu)):
                item_name = menu[i]
                if v == item_name:
                    if disabled_list[i]:
                        # report the offending item, not the whole
                        # sequence (which could even break the formatting)
                        raise AttributeError("item '%s' is disabled" % v)
                    selected[i] = True
                    found = True
                    break
            if not found:
                raise ItemNotFoundError("no item named '%s'" % repr(v))
        self._selected = selected

    def set_value_by_label(self, value):
        """Set the value from item labels (not supported here)."""
        raise NotImplementedError("control '%s' does not yet support "
                                  "by_label" % self.name)

    def get_value_by_label(self):
        """Return the value as item labels (not supported here)."""
        raise NotImplementedError("control '%s' does not yet support "
                                  "by_label" % self.name)

    def possible_items(self, by_label=False):
        """Return a copy of the list of item names in the control."""
        if by_label:
            raise NotImplementedError(
                "control '%s' does not yet support by_label" % self.name)
        return copy.copy(self._menu)

    def pairs(self):
        """Return the (control name, item name) pairs for selected items.

        A disabled control contributes nothing, and neither does a
        single-selection control that has no name or no selection.

        """
        if self.disabled:
            return []

        if not self.multiple:
            name = self.name
            value = self._selected
            if name is None or value is None:
                return []
            return [(name, value)]
        else:
            control_name = self.name  # usually the name HTML attribute
            pairs = []
            for i in range(len(self._menu)):
                item_name = self._menu[i]  # usually the value HTML attribute
                if self._selected[i]:
                    pairs.append((control_name, item_name))
            return pairs

    def _item_str(self, i):
        """Return item i for display: "*" if selected, "(...)" if disabled."""
        item_name = self._menu[i]
        if self.multiple:
            if self._selected[i]:
                item_name = "*"+item_name
        else:
            if self._selected == item_name:
                item_name = "*"+item_name
        if self._disabled_list[i]:
            item_name = "(%s)" % item_name
        return item_name

    def __str__(self):
        """Return "<ClassName(name=[items])>" plus any state flags."""
        name = self.name
        if name is None: name = ""

        display = []
        for i in range(len(self._menu)):
            s = self._item_str(i)
            display.append(s)

        infos = []
        if self.disabled: infos.append("disabled")
        if self.readonly: infos.append("readonly")
        info = string.join(infos, ", ")
        if info: info = " (%s)" % info

        return "<%s(%s=[%s])%s>" % (self.__class__.__name__,
                                    name, string.join(display, ", "), info)
class RadioControl(ListControl):
    """
    Covers:

    INPUT/RADIO

    """

    def __init__(self, type, name, attrs, select_default=False):
        """Create a single-item radio control from one INPUT's attributes.

        The item name is the "value" HTML attribute, defaulting to "on".
        The item starts out selected only if "checked" is present.

        """
        ListControl.__init__(self, type, name, attrs, select_default,
                             called_as_base_class=True)
        # multiple is read-only via __setattr__, so set it directly
        self.__dict__["multiple"] = False
        item_name = attrs.get("value", "on")
        self._menu = [item_name]
        self._selected = None
        if attrs.has_key("checked"):
            self._selected = item_name
            self._value_is_set = True

    def fixup(self):
        """Pick the default selection once all items have been merged in.

        Nothing changes if an item was explicitly checked.  Otherwise the
        first item is selected when select_default was requested.

        """
        if self._value_is_set:
            return
        # no item explicitly selected
        assert self._selected is None
        if self._select_default:
            self._selected = self._menu[0]
        self._value_is_set = True
class CheckboxControl(ListControl):
    """
    Covers:

    INPUT/CHECKBOX

    """

    def __init__(self, type, name, attrs, select_default=False):
        """Create a single-item checkbox control from one INPUT's attributes.

        The item name is the "value" HTML attribute, defaulting to "on";
        the item is selected exactly when "checked" is present.

        """
        ListControl.__init__(self, type, name, attrs, select_default,
                             called_as_base_class=True)
        # multiple is read-only via __setattr__, so set it directly
        self.__dict__["multiple"] = True
        self._menu = [attrs.get("value", "on")]
        self._selected = [attrs.has_key("checked")]
        # a checkbox never needs a default picked for it later
        self._value_is_set = True

    def fixup(self):
        """Nothing to do beyond checking the value was set at construction."""
        # If no items were explicitly checked in HTML, that's how we must
        # leave it, so we have nothing to do here.
        assert self._value_is_set
class SelectControl(ListControl):
"""
Covers:
SELECT (and OPTION)
SELECT control values and labels are subject to some messy defaulting
rules. For example, if the HTML repreentation of the control is:
The items, in order, have labels "2002", "2001" and "2000", whereas their
values are "0", "1" and "2000" respectively. Note that the value of the
last OPTION in this example defaults to its contents, as specified by RFC
1866, as do the labels of the second and third OPTIONs.
The purpose of these methods is that the OPTION labels are sometimes much
more meaningful, than are the OPTION values, which can make for more
maintainable code.
Additional read-only public attribute: attrs
The attrs attribute is a dictionary of the original HTML attributes of the
SELECT element. Other ListControls do not have this attribute, because in
other cases the control as a whole does not correspond to any single HTML
element. The get_item_attrs method may be used as usual to get at the
HTML attributes of the HTML elements corresponding to individual list items
(for SELECT controls, these are OPTION elements).
Another special case is that the attributes dictionaries returned by
get_item_attrs have a special key "contents" which does not correspond to
any real HTML attribute, but rather contains the contents of the OPTION
element:
"""
# HTML attributes here are treated slightly from other list controls:
# -The SELECT HTML attributes dictionary is stuffed into the OPTION
# HTML attributes dictionary under the "__select" key.
# -The content of each OPTION element is stored under the special
# "contents" key of the dictionary.
# After all this, the dictionary is passed to the SelectControl constructor
# as the attrs argument, as usual. However:
# -The first SelectControl constructed when building up a SELECT control
# has a constructor attrs argument containing only the __select key -- so
# this SelectControl represents an empty SELECT control.
# -Subsequent SelectControls have both OPTION HTML-attribute in attrs and
# the __select dictionary containing the SELECT HTML-attributes.
def __init__(self, type, name, attrs, select_default=False):
# fish out the SELECT HTML attributes from the OPTION HTML attributes
# dictionary
self.attrs = attrs["__select"].copy()
attrs = attrs.copy()
del attrs["__select"]
ListControl.__init__(self, type, name, attrs, select_default,
called_as_base_class=True)
self._label_map = None
self.disabled = self.attrs.has_key("disabled")
self.id = self.attrs.get("id")
self._menu = []
self._selected = []
self._value_is_set = False
if self.attrs.has_key("multiple"):
self.__dict__["multiple"] = True
self._selected = []
else:
self.__dict__["multiple"] = False
self._selected = None
if attrs: # OPTION item data was provided
value = attrs["value"]
self._menu.append(value)
selected = attrs.has_key("selected")
if selected:
self._value_is_set = True
if self.attrs.has_key("multiple"):
self._selected.append(selected)
elif selected:
self._selected = value
def _build_select_label_map(self):
"""Return an ordered mapping of labels to values.
For example, if the HTML repreentation of the control is as given in
SelectControl.__doc__, this function will return a mapping like:
{"2002": "0", "2001": "1", "2000": "2000"}
"""
alist = []
for val in self._menu:
attrs = self.get_item_attrs(val)
alist.append((attrs["label"], val))
return AList(alist)
def _value_from_label(self, label):
try:
return self._label_map[label]
except KeyError:
raise ItemNotFoundError("no item has label '%s'" % label)
def fixup(self):
if not self._value_is_set:
# No item explicitly selected.
if len(self._menu) > 0:
if self.multiple:
if self._select_default:
self._selected[0] = True
else:
assert self._selected is None
self._selected = self._menu[0]
self._value_is_set = True
self._label_map = self._build_select_label_map()
def possible_items(self, by_label=False):
if not by_label:
return copy.copy(self._menu)
else:
self._label_map.set_inverted(True)
try:
r = map(lambda v, self=self: self._label_map[v], self._menu)
finally:
self._label_map.set_inverted(False)
return r
def set_value_by_label(self, value):
if isstringlike(value):
raise TypeError("ListControl, must set a sequence, not a string")
if self.disabled:
raise AttributeError("control '%s' is disabled" % self.name)
if self.readonly:
raise AttributeError("control '%s' is readonly" % self.name)
try:
value = map(lambda v, self=self: self._label_map[v], value)
except KeyError, e:
raise ItemNotFoundError("no item has label '%s'" % e.args[0])
self._set_value(value)
def get_value_by_label(self):
menu = self._menu
self._label_map.set_inverted(True)
try:
if self.multiple:
values = []
for i in range(len(menu)):
if self._selected[i]:
values.append(self._label_map[menu[i]])
return values
else:
return [self._label_map[self._selected]]
finally:
self._label_map.set_inverted(False)
#---------------------------------------------------
class SubmitControl(ScalarControl):
    """
    Covers:

    INPUT/SUBMIT
    BUTTON/SUBMIT

    """

    def __init__(self, type, name, attrs):
        """Initialise as a readonly ScalarControl whose value defaults to ""."""
        ScalarControl.__init__(self, type, name, attrs)
        # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves
        # it blank, Konqueror 3.1 defaults to "Submit".  HTML spec. doesn't
        # seem to define this.
        if self.value is None: self.value = ""
        self.readonly = True

    def _click(self, form, coord, return_type):
        """Submit form as if this control had been clicked at coord.

        Returns whatever form._switch_click(return_type) returns.  The
        control only counts as clicked for the duration of that call.

        """
        self._clicked = coord
        try:
            return form._switch_click(return_type)
        finally:
            # always reset, even if building the submission fails:
            # otherwise the control would stay successful afterwards
            self._clicked = False

    def pairs(self):
        """Return the control's pairs, but only while it is being clicked."""
        if not self._clicked:
            return []
        return ScalarControl.pairs(self)
#---------------------------------------------------
class ImageControl(SubmitControl):
    """
    Covers:

    INPUT/IMAGE

    The value attribute of an ImageControl is always None.  Coordinates are
    specified using one of the HTMLForm.click* methods.

    """

    def __init__(self, type, name, attrs):
        """Initialise via ScalarControl directly and pin value to None."""
        # NOTE(review): SubmitControl.__init__ is bypassed, presumably
        # because it assigns to value, which is read-only here -- confirm.
        ScalarControl.__init__(self, type, name, attrs)
        self.__dict__["value"] = None

    def __setattr__(self, name, value):
        """Store an attribute; value, name and type are read-only.

        Raises AttributeError on an attempt to assign any of those three.
        """
        if name not in ("value", "name", "type"):
            self.__dict__[name] = value
            return
        raise AttributeError("%s attribute is readonly" % name)

    def pairs(self):
        """Return the "name.x" / "name.y" click-coordinate pairs.

        Nothing is returned unless the control is enabled, is currently
        being clicked, and has a name.
        """
        coord = self._clicked
        if self.disabled or not coord:
            return []
        control_name = self.name
        if control_name is None:
            return []
        x_pair = ("%s.x" % control_name, str(coord[0]))
        y_pair = ("%s.y" % control_name, str(coord[1]))
        return [x_pair, y_pair]
# aliases, just to make str(control) and str(form) clearer
# INPUT/PASSWORD: behaves exactly like TextControl
class PasswordControl(TextControl): pass
# INPUT/HIDDEN: behaves exactly like TextControl
class HiddenControl(TextControl): pass
# TEXTAREA: behaves exactly like TextControl
class TextareaControl(TextControl): pass
# BUTTON/SUBMIT: behaves exactly like SubmitControl
class SubmitButtonControl(SubmitControl): pass
def is_listcontrol(control):
    """Return true if control is a ListControl (or subclass) instance."""
    return isinstance(control, ListControl)
class HTMLForm:
"""Represents a single HTML element.
A form consists of a sequence of controls that usually have names, and
which can take on various values. The values of the various types of
controls represent variously: text, zero-, one- or many-of-many choices,
and files to be uploaded.
Forms can be filled in with data to be returned to the server, and then
submitted, using the click method to generate a request object suitable for
passing to urllib2.urlopen (or the click_request_data or click_pairs
methods if you're not using urllib2).
import ClientForm
forms = ClientForm.ParseFile(html, base_uri)
form = forms[0]
form["query"] = "Python"
form.set("lots", "nr_results")
response = urllib2.urlopen(form.click())
Usually, HTMLForm instances are not created directly. Instead, the
ParseFile or ParseResponse factory functions are used. If you do construct
HTMLForm objects yourself, however, note that an HTMLForm instance is only
properly initialised after the fixup method has been called (ParseFile and
ParseResponse do this for you). See ListControl.__doc__ for the reason
this is required.
Indexing a form (form["control_name"]) returns the named Control's value
attribute. Assignment to a form index (form["control_name"] = something)
is equivalent to assignment to the named Control's value attribute. If you
need to be more specific than just supplying the control's name, use the
set_value and get_value methods.
ListControl values are lists of item names. The list item's name is the
value of the corresponding HTML element's "value" attribute.
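Example:
<INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT>
<INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT>
defines a CHECKBOX control with name "cheeses" which has two items, named
"leicester" and "cheddar".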
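Another example:
<SELECT name="more_cheeses">
<OPTION>1</OPTION>
<OPTION value="2" label="CHEDDAR">cheddar</OPTION>
</SELECT>
defines a SELECT control with name "more_cheeses" which has two items,
named "1" and "2".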
To set, clear or toggle individual list items, use the set and toggle
methods. To set the whole value, do as for any other control: use indexing
or the set_/get_value methods.
Example:
# select *only* the item named "cheddar"
form["cheeses"] = ["cheddar"]
# select "cheddar", leave other items unaffected
form.set("cheddar", "cheeses")
Some controls (RADIO and SELECT without the multiple attribute) can only
have zero or one items selected at a time. Some controls (CHECKBOX and
SELECT with the multiple attribute) can have multiple items selected at a
time. To set the whole value of a multiple-selection ListControl, assign a
sequence to a form index:
form["cheeses"] = ["cheddar", "leicester"]
To check whether a control has an item, or whether an item is selected,
respectively:
"cheddar" in form.possible_items("cheeses")
"cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses"))
Note that some items may be disabled (see below).
Note the following mistake:
form[control_name] = control_value
assert form[control_name] == control_value # not necessarily true
The reason for this is that form[control_name] always gives the list items
in the order they were listed in the HTML.
List items (hence list values, too) can be referred to in terms of list
item labels rather than list item names. Currently, this is only possible
for SELECT controls (this is a bug). To use this feature, use the by_label
arguments to the various HTMLForm methods. Note that it is *item* names
(hence ListControl values also), not *control* names, that can be referred
to by label.
The question of default values of OPTION contents, labels and values is
somewhat complicated: see SelectControl.__doc__ and
ListControl.get_item_attrs.__doc__ if you think you need to know.
Controls can be disabled or readonly. In either case, the control's value
cannot be changed until you clear those flags (using the methods on
HTMLForm). Disabled is the state typically represented by browsers by
`greying out' a control. Disabled controls are not `successful' -- they
don't cause data to get returned to the server. Readonly controls usually
appear in browsers as read-only text boxes. Readonly controls are
successful. List items can also be disabled. Attempts to select disabled
items (with form[name] = value, or using the ListControl.set method, for
example) fail. Attempts to clear disabled items are allowed.
If a lot of controls are readonly, it can be useful to do this:
form.set_all_readonly(False)
When you want to do several things with a single control, or want to do
less common things, like changing which controls and items are disabled,
you can get at a particular control:
control = form.find_control("cheeses")
control.set_item_disabled(False, "gruyere")
control.set("gruyere")
Most methods on HTMLForm just delegate to the contained controls, so see
the docstrings of the various Control classes for further documentation.
Most of these delegating methods take name, type, kind, id and nr arguments
to specify the control to be operated on: see
HTMLForm.find_control.__doc__.
ControlNotFoundError (subclass of ValueError) is raised if the specified
control can't be found. This includes occasions where a non-ListControl
is found, but the method (set, for example) requires a ListControl.
ItemNotFoundError (subclass of ValueError) is raised if a list item can't
be found. ItemCountError (subclass of ValueError) is raised if an attempt
is made to select more than one item and the control doesn't allow that, or
set/get_single are called and the control contains more than one item.
AttributeError is raised if a control or item is readonly or disabled and
an attempt is made to alter its value.
XXX CheckboxControl and RadioControl don't yet support item access by label
Security note: Remember that any passwords you store in HTMLForm instances
will be saved to disk in the clear if you pickle them (directly or
indirectly). The simplest solution to this is to avoid pickling HTMLForm
objects. You could also pickle before filling in any password, or just set
the password to "" before pickling.
Public attributes:
action: full (absolute URI) form action
method: "GET" or "POST"
enctype: form transfer encoding MIME type
name: name of form (None if no name was specified)
attrs: dictionary mapping original HTML form attributes to their values
controls: list of Control instances; do not alter this list
(instead, call form.new_control to make a Control and add it to the
form, or control.add_to_form if you already have a Control instance)
Methods for form filling:
-------------------------
Most of these methods have very similar arguments. See
HTMLForm.find_control.__doc__ for details of the name, type, kind and nr
arguments. See above for a description of by_label.
def find_control(self,
name=None, type=None, kind=None, id=None, predicate=None,
nr=None)
get_value(name=None, type=None, kind=None, id=None, nr=None,
by_label=False)
set_value(value,
name=None, type=None, kind=None, id=None, nr=None,
by_label=False)
set_all_readonly(readonly)
Methods applying only to ListControls:
possible_items(name=None, type=None, kind=None, id=None, nr=None,
by_label=False)
set(selected, item_name,
name=None, type=None, kind=None, id=None, nr=None,
by_label=False)
toggle(item_name,
name=None, type=None, id=None, nr=None,
by_label=False)
set_single(selected,
name=None, type=None, kind=None, id=None, nr=None,
by_label=False)
toggle_single(name=None, type=None, kind=None, id=None, nr=None,
by_label=False)
Method applying only to FileControls:
add_file(file_object,
content_type="application/octet-stream", filename=None,
name=None, id=None, nr=None)
Methods applying only to clickable controls:
click(name=None, type=None, id=None, nr=0, coord=(1,1))
click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1))
click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1))
"""
# Maps a lower-cased control type string to the Control class that
# new_control instantiates for it.
type2class = {
    "text": TextControl,
    "password": PasswordControl,
    "hidden": HiddenControl,
    "textarea": TextareaControl,
    "isindex": IsindexControl,
    "file": FileControl,
    # plain and reset buttons never contribute form data
    "button": IgnoreControl,
    "buttonbutton": IgnoreControl,
    "reset": IgnoreControl,
    "resetbutton": IgnoreControl,
    "submit": SubmitControl,
    "submitbutton": SubmitButtonControl,
    "image": ImageControl,
    "radio": RadioControl,
    "checkbox": CheckboxControl,
    "select": SelectControl,
    }
#---------------------------------------------------
# Initialisation. Use ParseResponse / ParseFile instead.
def __init__(self, action, method="GET",
enctype="application/x-www-form-urlencoded",
name=None, attrs=None):
"""
In the usual case, use ParseResponse (or ParseFile) to create new
HTMLForm objects.
action: full (absolute URI) form action
method: "GET" or "POST"
enctype: form transfer encoding MIME type
name: name of form
attrs: dictionary mapping original HTML form attributes to their values
"""
self.action = action
self.method = method
self.enctype = enctype
self.name = name
if attrs is not None:
self.attrs = attrs.copy()
else:
self.attrs = {}
self.controls = []
def new_control(self, type, name, attrs,
                ignore_unknown=False, select_default=False):
    """Build a control of the given type and add it to this form.

    This is usually called by ParseFile and ParseResponse.  Don't call it
    yourself unless you're building your own Control instances.

    Note that controls representing lists of items are built up from
    controls holding only a single list item.  See ListControl.__doc__ for
    further information.

    type: type of control (see Control.__doc__ for a list); it is
     lower-cased before being looked up in type2class
    name: control name, passed through to the control class
    attrs: HTML attributes of control (a copy is handed to the control)
    ignore_unknown: if true, use a dummy Control instance for controls of
     unknown type; otherwise, raise ValueError
    select_default: for RADIO and multiple-selection SELECT controls, pick
     the first item as the default if no 'selected' HTML attribute is
     present (this defaulting happens when the HTMLForm.fixup method is
     called)

    Nothing is returned; the new control attaches itself via its
    add_to_form method.

    """
    type_name = string.lower(type)
    try:
        klass = self.type2class[type_name]
    except KeyError:
        if not ignore_unknown:
            raise ValueError("Unknown control type '%s'" % type_name)
        klass = IgnoreControl

    attrs_copy = attrs.copy()
    # Only list controls take the extra select_default argument.
    if not issubclass(klass, ListControl):
        control = klass(type_name, name, attrs_copy)
    else:
        control = klass(type_name, name, attrs_copy, select_default)
    control.add_to_form(self)
def fixup(self):
    """Normalise the form once every control has been added.

    This is usually called by ParseFile and ParseResponse.  Don't call it
    yourself unless you're building your own Control instances.

    Each control's own fixup method is invoked, in form order.  Call this
    only once, after all controls have been added to the form.

    """
    for member in self.controls:
        member.fixup()
#---------------------------------------------------
def __str__(self):
header = "%s %s %s" % (self.method, self.action, self.enctype)
rep = [header]
for control in self.controls:
rep.append(" %s" % str(control))
return "<%s>" % string.join(rep, "\n")
#---------------------------------------------------
# Form-filling methods.
def __getitem__(self, name):
return self.find_control(name).value
def __setitem__(self, name, value):
control = self.find_control(name)
try:
control.value = value
except AttributeError, e:
raise ValueError(str(e))
def get_value(self,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False):
    """Return value of control.

    If only the name argument is supplied, equivalent to

    form[name]

    The control is located with find_control.  If by_label is true, the
    control's get_value_by_label method supplies the result;
    NotImplementedError is raised if the control has no such method.

    """
    control = self.find_control(name, type, kind, id, nr=nr)
    if not by_label:
        return control.value

    try:
        getter = control.get_value_by_label
    except AttributeError:
        raise NotImplementedError(
            "control '%s' does not yet support by_label" % control.name)
    return getter()
def set_value(self, value,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False):
    """Set value of control.

    If only name and value arguments are supplied, this performs the same
    assignment as

    form[name] = value

    except that an AttributeError raised by the assignment is not
    converted to ValueError here.

    The control is located with find_control.  If by_label is true, the
    value is handed to the control's set_value_by_label method;
    NotImplementedError is raised if the control has no such method.

    """
    control = self.find_control(name, type, kind, id, nr=nr)
    if not by_label:
        control.value = value
        return

    try:
        setter = control.set_value_by_label
    except AttributeError:
        raise NotImplementedError(
            "control '%s' does not yet support by_label" % control.name)
    setter(value)
def set_all_readonly(self, readonly):
    """Set the readonly attribute of every control in the form.

    readonly: any value; its boolean interpretation is what gets stored

    """
    for member in self.controls:
        member.readonly = bool(readonly)
#---------------------------------------------------
# Form-filling methods applying only to ListControls.
def possible_items(self,
                   name=None, type=None, kind=None, id=None, nr=None,
                   by_label=False):
    """Return a list of all values that the specified control can take.

    The list control is located from name, type, kind, id and nr, and the
    result is whatever its possible_items(by_label) method returns.

    """
    return self._find_list_control(
        name, type, kind, id, nr).possible_items(by_label)
def set(self, selected, item_name,
        name=None, type=None, kind=None, id=None, nr=None,
        by_label=False):
    """Select / deselect named list item.

    selected: boolean selected state
    item_name: which item of the list control to change

    The list control is located from name, type, kind, id and nr.

    """
    list_control = self._find_list_control(name, type, kind, id, nr)
    list_control.set(selected, item_name, by_label)
def toggle(self, item_name,
           name=None, type=None, kind=None, id=None, nr=None,
           by_label=False):
    """Toggle selected state of named list item.

    The list control is located from name, type, kind, id and nr.

    """
    list_control = self._find_list_control(name, type, kind, id, nr)
    list_control.toggle(item_name, by_label)
def set_single(self, selected,
               name=None, type=None, kind=None, id=None, nr=None,
               by_label=False):
    """Select / deselect list item in a control having only one item.

    If the control has multiple list items, ItemCountError is raised.

    This is just a convenience method, so you don't need to know the item's
    name -- the item name in these single-item controls is usually
    something meaningless like "1" or "on".

    For example, if a checkbox has a single item named "on", the following
    two calls are equivalent:

    control.set(True, "on")
    control.set_single(True)

    """
    list_control = self._find_list_control(name, type, kind, id, nr)
    list_control.set_single(selected, by_label)
def toggle_single(self, name=None, type=None, kind=None, id=None, nr=None,
                  by_label=False):
    """Toggle selected state of list item in control having only one item.

    The rest is as for HTMLForm.set_single.__doc__.

    """
    list_control = self._find_list_control(name, type, kind, id, nr)
    list_control.toggle_single(by_label)
#---------------------------------------------------
# Form-filling method applying only to FileControls.
def add_file(self, file_object, content_type=None, filename=None,
             name=None, id=None, nr=None):
    """Add a file to be uploaded.

    file_object: file-like object (with read method) from which to read
     data to upload
    content_type: MIME content type of data to upload
    filename: filename to pass to server
    name, id, nr: select which control of type "file" receives the file
     (as for find_control)

    If filename is None, no filename is sent to the server.

    If content_type is None, the content type is guessed based on the
    filename and the data read from the file object.

    XXX
    At the moment, guessed content type is always application/octet-stream.
    Use sndhdr, imghdr modules.  Should also try to guess HTML, XML, and
    plain text.

    """
    file_control = self.find_control(name, "file", id=id, nr=nr)
    file_control.add_file(file_object, content_type, filename)
#---------------------------------------------------
# Form submission methods, applying only to clickable controls.
def click(self, name=None, type=None, id=None, nr=0, coord=(1,1)):
    """Return request that would result from clicking on a control.

    The request object is a urllib2.Request instance, which you can pass to
    urllib2.urlopen (or ClientCookie.urlopen).

    Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and
    IMAGEs) can be clicked.

    Will click on the first clickable control, subject to the name, type
    and nr arguments (as for find_control).  If no name, type, id or number
    is specified and there are no clickable controls, a request will be
    returned for the form in its current, un-clicked, state.

    ControlNotFoundError is raised if any of name, type, id or a non-zero
    nr is specified but no matching control is found.  ValueError is raised
    if the HTMLForm has an enctype attribute that is not recognised.

    You can optionally specify a coordinate to click at, which only makes a
    difference if you clicked on an image.

    """
    wanted = "request"
    return self._click(name, type, id, nr, coord, wanted)
def click_request_data(self,
                       name=None, type=None, id=None, nr=0, coord=(1,1)):
    """As for click method, but return a tuple (url, data, headers).

    You can use this data to send a request to the server.  This is useful
    if you're using httplib or urllib rather than urllib2.  Otherwise, use
    the click method.

    # Untested.  Have to subclass to add headers, I think -- so use urllib2
    # instead!
    import urllib
    url, data, hdrs = form.click_request_data()
    r = urllib.urlopen(url, data)

    # Untested.  I don't know of any reason to use httplib -- you can get
    # just as much control with urllib2.
    import httplib, urlparse
    url, data, hdrs = form.click_request_data()
    tup = urlparse.urlparse(url)
    host, path = tup[1], urlparse.urlunparse((None, None)+tup[2:])
    conn = httplib.HTTPConnection(host)
    if data:
        conn.request("POST", path, data, hdrs)
    else:
        conn.request("GET", path, headers=hdrs)
    r = conn.getresponse()

    """
    wanted = "request_data"
    return self._click(name, type, id, nr, coord, wanted)
def click_pairs(self, name=None, type=None, id=None, nr=0, coord=(1,1)):
    """As for click_request_data, but returns a list of (key, value) pairs.

    You can use this list as an argument to ClientForm.urlencode.  This is
    usually only useful if you're using httplib or urllib rather than
    urllib2 or ClientCookie.  It may also be useful if you want to manually
    tweak the keys and/or values, but this should not be necessary.
    Otherwise, use the click method.

    Note that this method is only useful for forms of MIME type
    x-www-form-urlencoded.  In particular, it does not return the
    information required for file upload.  If you need file upload and are
    not using urllib2, use click_request_data.

    Also note that Python 2.0's urllib.urlencode is slightly broken: it
    only accepts a mapping, not a sequence of pairs, as an argument.  This
    messes up any ordering in the argument.  Use ClientForm.urlencode
    instead.

    """
    wanted = "pairs"
    return self._click(name, type, id, nr, coord, wanted)
#---------------------------------------------------
def find_control(self,
                 name=None, type=None, kind=None, id=None, predicate=None,
                 nr=None):
    """Locate some specific control within the form.

    At least one of the name, type, kind, id, predicate and nr arguments
    must be supplied; ValueError is raised otherwise.  If no matching
    control is found, ControlNotFoundError is raised.

    If name is specified, then the control must have the indicated name.

    If type is specified then the control must have the specified type (in
    addition to the types possible for HTML tags: "text",
    "password", "hidden", "submit", "image", "button", "radio", "checkbox",
    "file" we also have "reset", "buttonbutton", "submitbutton",
    "resetbutton", "textarea", "select" and "isindex").

    If kind is specified, then the control must fall into the specified
    group, each of which satisfies a particular interface.  The kinds are
    "text", "list", "multilist", "singlelist", "clickable" and "file".

    If id is specified, then the control must have the indicated id.

    If predicate is specified, then the control must match that function.
    The predicate function is passed the control as its single argument,
    and should return a boolean value indicating whether the control
    matched.

    nr, if supplied, is the sequence number of the control (where 0 is the
    first).  Note that control 0 is the first control matching all the
    other arguments (if supplied); it is not necessarily the first control
    in the form.

    """
    if nr is None:
        # Without nr, at least one other criterion is required.
        if ((name is None) and (type is None) and (kind is None) and
            (id is None) and (predicate is None)):
            raise ValueError(
                "at least one argument must be supplied to specify control")
        nr = 0
    return self._find_control(name, type, kind, id, predicate, nr)
#---------------------------------------------------
# Private methods.
def _find_list_control(self,
name=None, type=None, kind=None, id=None, nr=None):
if ((name is None) and (type is None) and (kind is None) and
(id is None) and (nr is None)):
raise ValueError(
"at least one argument must be supplied to specify control")
if nr is None: nr = 0
return self._find_control(name, type, kind, id, is_listcontrol, nr)
def _find_control(self, name, type, kind, id, predicate, nr):
if (name is not None) and not isstringlike(name):
raise TypeError("control name must be string-like")
if (type is not None) and not isstringlike(type):
raise TypeError("control type must be string-like")
if (kind is not None) and not isstringlike(kind):
raise TypeError("control kind must be string-like")
if (id is not None) and not isstringlike(id):
raise TypeError("control id must be string-like")
if (predicate is not None) and not callable(predicate):
raise TypeError("control predicate must be callable")
if nr < 0: raise ValueError("control number must be a positive "
"integer")
orig_nr = nr
for control in self.controls:
if name is not None and name != control.name:
continue
if type is not None and type != control.type:
continue
if (kind is not None and
not self._is_control_in_kind(control, kind)):
continue
if id is not None and id != control.id:
continue
if predicate and not predicate(control):
continue
if nr:
nr = nr - 1
continue
return control
description = []
if name is not None: description.append("name '%s'" % name)
if type is not None: description.append("type '%s'" % type)
if kind is not None: description.append("kind '%s'" % kind)
if id is not None: description.append("id '%s'" % id)
if predicate is not None:
description.append("matching predicate %s" % predicate)
if orig_nr: description.append("nr %d" % orig_nr)
description = string.join(description, ", ")
raise ControlNotFoundError("no control with "+description)
def _is_control_in_kind(self, control, kind):
# XXX not OO
if kind == "list":
return isinstance(control, ListControl)
elif kind == "multilist":
return bool(isinstance(control, ListControl) and control.multiple)
elif kind == "singlelist":
return bool(isinstance(control, ListControl) and
not control.multiple)
elif kind == "file":
return isinstance(control, FileControl)
elif kind == "text":
return isinstance(control, TextControl)
elif kind == "clickable":
return (isinstance(control, SubmitControl) or
isinstance(control, IsindexControl))
else:
raise ValueError("no such control kind '%s'" % kind)
def _click(self, name, type, id, nr, coord, return_type):
try:
control = self._find_control(name, type, "clickable", id, None, nr)
except ControlNotFoundError:
if ((name is not None) or (type is not None) or (id is not None) or
(nr != 0)):
raise
# no clickable controls, but no control was explicitly requested,
# so return state without clicking any control
return self._switch_click(return_type)
else:
return control._click(self, coord, return_type)
def _pairs(self):
"""Return sequence of (key, value) pairs suitable for urlencoding."""
pairs = []
for control in self.controls:
pairs.extend(control.pairs())
return pairs
def _request_data(self):
    """Return a tuple (url, data, headers).

    For GET the url-encoded pairs are appended to the action after a "?",
    data is None and headers is empty.  For POST the body is either the
    url-encoded pairs or a multipart/form-data document written by the
    controls.  The method is compared case-insensitively.

    ValueError is raised for an unknown method, or for an enctype that
    is not supported with the form's method.

    """
    method = string.upper(self.method)

    if method == "GET":
        if self.enctype != "application/x-www-form-urlencoded":
            raise ValueError(
                "unknown GET form encoding type '%s'" % self.enctype)
        # NOTE(review): the pairs are appended after "?" even if the action
        # already carries a query string or fragment -- confirm that actions
        # are expected to be free of both.
        uri = "%s?%s" % (self.action, urlencode(self._pairs()))
        return uri, None, []

    if method != "POST":
        raise ValueError("Unknown method '%s'" % method)

    if self.enctype == "application/x-www-form-urlencoded":
        return (self.action, urlencode(self._pairs()),
                [("Content-type", self.enctype)])

    if self.enctype == "multipart/form-data":
        body = StringIO()
        http_hdrs = []
        writer = MimeWriter(body, http_hdrs)
        # The writer is asked to record the multipart headers in http_hdrs.
        writer.startmultipartbody("form-data", add_to_http_hdrs=True,
                                  prefix=0)
        for control in self.controls:
            control._write_mime_data(writer)
        writer.lastpart()
        return self.action, body.getvalue(), http_hdrs

    raise ValueError(
        "unknown POST form encoding type '%s'" % self.enctype)
def _switch_click(self, return_type):
# This is called by HTMLForm and clickable Controls to hide switching
# on return_type.
# XXX
# not OO
# duplicated in IsindexControl._click
if return_type == "pairs":
return self._pairs()
elif return_type == "request_data":
return self._request_data()
else:
req_data = self._request_data()
req = urllib2.Request(req_data[0], req_data[1])
for key, val in req_data[2]:
req.add_header(key, val)
return req