Python - __getattr__ in parent class causing subclass __init__ recursion error
Following the advice in this answer: "Subclassing BeautifulSoup HTML parser, getting type error", I'm trying to use class composition instead of subclassing BeautifulSoup.
The basic scraper class works fine on its own (at least in limited testing).
the scraper class:
from beautifulsoup import beautifulsoup import urllib2 class scrape(): """base class subclassed wrapper providers basic url fetching urllib2 , basic html parsing beautifulsoupץ useful methods provided class composition beautifulsoup. direct access soup class can use _soup property.""" def __init__(self,file): self._file = file #very basic input validation #import re #import urllib2 #from beautifulsoup import beautifulsoup try: self._page = urllib2.urlopen(self._file) #fetching page except (urllib2.urlerror): print ('please enter valid url starting http/https/ftp/file') self._soup = beautifulsoup(self._page) #calling html parser #beautifulsoup.__init__(self,self._page) # next part class compostion part - transform attribute , method calls beautifulsoup class #search functions: self.find = self._soup.find self.findall = self._soup.findall self.__iter__ = self._soup.__iter__ #enables iterating,looping in object self.__len__ = self._soup.__len__ self.__contains__ = self._soup.__contains__ #attribute fetching , setting - __getattr__ implented scraper class self.__setattr__ = self._soup.__setattr__ self.__getattribute__ = self._soup.__getattribute__ #called implement evaluation of self[key] self.__getitem__ = self._soup.__getitem__ self.__setitem__ = self._soup.__setitem__ self.__delitem__ = self._soup.__delitem__ self.__call__ = self._soup.__call__#called when instance “called” function self._getattrmap = self._soup._getattrmap self.has_key = self._soup.has_key #walking html document methods self.contents = self._soup.contents self.text = self._soup.text self.extract = self._soup.extract self.next = self._soup.next self.parent = self._soup.parent self.fetch = self._soup.fetch self.fetchtext = self._soup.fetchtext self.findallnext = self._soup.findallnext self.findchild = self._soup.findchild self.findchildren = self._soup.findchildren self.findnext = self._soup.findnext self.findnextsibling = self._soup.findnextsibling self.first = self._soup.first self.name = 
self._soup.name self.get = self._soup.get self.getstring = self._soup.getstring # comparison operators or similiar boolean checks self.__eq__ = self._soup.__eq__ self.__ne__ = self._soup.__ne__ self.__hash__ = self._soup.__hash__ self.__nonezero__ = self._soup.__nonzero__ #not sure # class represntation magic methods: self.__str__ = self._soup.__str__ self.__repr__ =self._soup.__repr__ #self.__dict__ = self._soup.__dict__ def __getattr__(self,method): """basically 'magic' method transforms calls unknown attributes , enables traverse html document .notation. example - using instancename.div return first div. explantion: python calls __getattr__ if didn't find method or attribute correspanding call. i'm not sure or right use method """ return self._soup.find(method) def clean(self,work=false,element=false): """clean method provides:basic cleaning of head,scripts etc input 'work' soup object clean unneccesary parts:scripts,head,style has optional variable:'element' can tuple of element enables override element clean""" self._work = work or self._soup self._cleanelements=element or ("head","style","script") #for elem in self._work.findall(self._cleanelements): elem in self.findall(self._cleanelements): elem.extract() but when subclass sort of recursion loop, can figure.
Here is the subclass (the relevant parts):
class maintraffic(scrape): """class traffic - subclasses scrape class inputs page url , category""" def __init__(self, file, cat, caller = false): if not caller: self._file = file #import urllib2 #self._request = urllib2.request(self._file)# request post show questions scrape.__init__(self,self._file) self.pagecat = cat self.clean(self) self.cleansoup = self.cleantotable(self) self.fetchlinks(self.cleansoup) #self.populatequestiondic() #del (self.cleansoup) def cleantotable(self): pass def fetchlinks(self,fetch): pass def length(self): sqlalchemy import func self.len = session.query(func.count(question.id)).scalar() return int(self.len) def __len__(self): return self.length() def __repr__(self): self.repr = "traffic theory question, current number of questions:{0}".format(self.length()) return self.repr def __getitem__(self,key): try: self._item = session.query(question).filter_by(question_num=key).first() return self._item except (indexerror, keyerror): print "no such key:{0}".format(key) and here error message:
file "c:\python27\learn\traffic.py", line 117, in __init__ scrape.__init__(self,self._file) file "c:\python27\learn\traffic.py", line 26, in __init__ self._soup = beautifulsoup(self._page) #calling html parser file "c:\python27\learn\traffic.py", line 92, in __getattr__ return self._soup.find(method) file "c:\python27\learn\traffic.py", line 92, in __getattr__ return self._soup.find(method) file "c:\python27\learn\traffic.py", line 92, in __getattr__ return self._soup.find(method) runtimeerror: maximum recursion depth exceeded
I suspect the problem is me misusing __getattr__, but I couldn't figure out what I should change.
part 1
Your code doesn't work because __getattr__() accesses self._soup before it has been initialized. This happens due to these four innocuous-looking lines:
try: self._page = urllib2.urlopen(self._file) except (urllib2.urlerror): print ('please enter valid url starting http/https/ftp/file')
Why catch the exception and then not handle it?
The next line accesses self._page, which has not been set yet if urlopen() threw an exception:
self._soup = beautifulsoup(self._page)
Since self._page hasn't been set, accessing it calls __getattr__(), which accesses self._soup, which has not been set yet either, so that access calls __getattr__() again, and so on without end.
The easiest "fix" is to special-case _soup to prevent the infinite recursion. Additionally, it seems to make more sense for __getattr__ to do a normal attribute lookup on the soup:
def __getattr__(self,attr): if attr == "_soup": raise attributeerror() return getattr(self._soup,attr) part 2
Copying all the methods over is unlikely to work well, and seems to miss the point of class composition entirely.
Comments
Post a Comment