python - __getattr__ in parent class causing subclass __init__ recursion error -
Following the advice in the answer to "Subclassing BeautifulSoup HTML parser, getting type error", I'm trying to use class composition instead of subclassing BeautifulSoup.
The basic scraper class works fine on its own (at least in limited testing).
The scraper class:
from BeautifulSoup import BeautifulSoup
import urllib2


class scrape():
    """Base wrapper: fetch a URL with urllib2 and parse it with BeautifulSoup.

    The parsed document is stored on ``self._soup`` and a selection of its
    methods and attributes is re-exposed on the wrapper by plain assignment
    in ``__init__`` (composition rather than subclassing BeautifulSoup).
    Use ``_soup`` directly for anything that is not re-exposed.
    """

    def __init__(self, file):
        """Open ``file`` (a URL), parse the response and bind the soup API.

        A ``urllib2.URLError`` is reported with ``print`` and not re-raised.
        """
        self._file = file
        try:
            self._page = urllib2.urlopen(self._file)  # fetching the page
        except (urllib2.URLError):
            # NOTE: the error is only printed, so self._page stays unset and
            # the next statement looks it up through __getattr__ below.
            print('please enter valid url starting http/https/ftp/file')
        self._soup = BeautifulSoup(self._page)  # calling the html parser

        # Composition part: forward attribute and method access to the soup.
        # search functions
        self.find = self._soup.find
        self.findall = self._soup.findAll
        self.__iter__ = self._soup.__iter__  # enables iterating/looping
        self.__len__ = self._soup.__len__
        self.__contains__ = self._soup.__contains__
        # attribute fetching and setting (__getattr__ is defined on this class)
        self.__setattr__ = self._soup.__setattr__
        self.__getattribute__ = self._soup.__getattribute__
        # evaluation of self[key]
        self.__getitem__ = self._soup.__getitem__
        self.__setitem__ = self._soup.__setitem__
        self.__delitem__ = self._soup.__delitem__
        self.__call__ = self._soup.__call__  # instance "called" as a function
        self._getattrmap = self._soup._getAttrMap
        self.has_key = self._soup.has_key
        # walking the html document
        self.contents = self._soup.contents
        self.text = self._soup.text
        self.extract = self._soup.extract
        self.next = self._soup.next
        self.parent = self._soup.parent
        self.fetch = self._soup.fetch
        self.fetchtext = self._soup.fetchText
        self.findallnext = self._soup.findAllNext
        self.findchild = self._soup.findChild
        self.findchildren = self._soup.findChildren
        self.findnext = self._soup.findNext
        self.findnextsibling = self._soup.findNextSibling
        self.first = self._soup.first
        self.name = self._soup.name
        self.get = self._soup.get
        self.getstring = self._soup.getString
        # comparison operators and similar boolean checks
        self.__eq__ = self._soup.__eq__
        self.__ne__ = self._soup.__ne__
        self.__hash__ = self._soup.__hash__
        self.__nonzero__ = self._soup.__nonzero__
        # class representation
        self.__str__ = self._soup.__str__
        self.__repr__ = self._soup.__repr__

    def __getattr__(self, method):
        """Treat an unknown attribute as a tag name: ``inst.div`` -> first div.

        Python calls this only when normal attribute lookup fails.
        NOTE: it reads ``self._soup``; when ``_soup`` has not been assigned
        yet, that read lands here again and recurses without bound.
        """
        return self._soup.find(method)

    def clean(self, work=False, element=False):
        """Extract unwanted elements (default: head, style, script).

        ``work`` is stored on ``self._work`` (falling back to ``self._soup``)
        but the search itself runs through ``self.findall``.  ``element`` may
        be a tuple of tag names overriding the default set.
        """
        self._work = work or self._soup
        self._cleanelements = element or ("head", "style", "script")
        for elem in self.findall(self._cleanelements):
            elem.extract()
But when I subclass it, I get some sort of recursion loop, as far as I can figure.
Here is the subclass (the relevant parts):
class maintraffic(scrape):
    """Traffic-question scraper built on ``scrape``.

    Takes a page URL and a category.  ``session`` and ``question`` are
    expected to be defined elsewhere in the module (not shown here).
    """

    def __init__(self, file, cat, caller=False):
        """Fetch and pre-process the page unless ``caller`` is truthy."""
        # NOTE(review): the nesting below is reconstructed; the flattened
        # listing does not show which statements belong to this branch.
        if not caller:
            self._file = file
            scrape.__init__(self, self._file)
            self.pagecat = cat
            self.clean(self)
            self.cleansoup = self.cleantotable(self)
            self.fetchlinks(self.cleansoup)

    def cleantotable(self):
        # body omitted in this excerpt
        pass

    def fetchlinks(self, fetch):
        # body omitted in this excerpt
        pass

    def length(self):
        """Return the number of stored questions as an int."""
        from sqlalchemy import func
        self.len = session.query(func.count(question.id)).scalar()
        return int(self.len)

    def __len__(self):
        return self.length()

    def __repr__(self):
        self.repr = "traffic theory question, current number of questions:{0}".format(self.length())
        return self.repr

    def __getitem__(self, key):
        """Return the first question whose ``question_num`` equals ``key``."""
        try:
            self._item = session.query(question).filter_by(question_num=key).first()
            return self._item
        except (IndexError, KeyError):
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2.
            print("no such key:{0}".format(key))
And here is the error message:
  File "c:\python27\learn\traffic.py", line 117, in __init__
    scrape.__init__(self,self._file)
  File "c:\python27\learn\traffic.py", line 26, in __init__
    self._soup = BeautifulSoup(self._page) #calling html parser
  File "c:\python27\learn\traffic.py", line 92, in __getattr__
    return self._soup.find(method)
  File "c:\python27\learn\traffic.py", line 92, in __getattr__
    return self._soup.find(method)
  File "c:\python27\learn\traffic.py", line 92, in __getattr__
    return self._soup.find(method)
RuntimeError: maximum recursion depth exceeded
I suspect the problem is me misusing __getattr__, but I couldn't figure out what I should change.
part 1
Your code doesn't work because __getattr__() accesses self._soup before it has been initialized. This happens due to these 4 innocuous-looking lines:
try:
    self._page = urllib2.urlopen(self._file)
except (urllib2.URLError):  # reported below, but not re-raised
    print ('please enter valid url starting http/https/ftp/file')
Why catch the exception if you do not handle it?
The next line accesses self._page, which has not been set yet if urlopen() threw an exception:
self._soup = BeautifulSoup(self._page)  # self._page is unset if urlopen() raised
Since it hasn't been set, accessing it calls __getattr__(), which accesses self._soup, which has not been set yet either, so that access calls __getattr__() again, and so on.
The easiest "fix" is to special-case _soup to prevent the infinite recursion. Additionally, it seems to make more sense for __getattr__ to do a normal attribute lookup on the soup:
def __getattr__(self, attr):
    """Delegate attribute lookups that failed normally to the wrapped soup.

    Raises AttributeError for ``_soup`` itself instead of delegating.
    """
    # Reading self._soup while it is still missing would land here again,
    # so that one name is refused outright to break the recursion.
    if attr == "_soup":
        raise AttributeError(attr)
    return getattr(self._soup, attr)
part 2
Copying all the methods over is unlikely to work well, and seems to miss the point of class composition entirely.
Comments
Post a Comment