diff options
author | Luo Jinghua <sunmoon1997@gmail.com> | 2010-11-14 14:57:24 +0800 |
---|---|---|
committer | Luo Jinghua <sunmoon1997@gmail.com> | 2010-11-14 14:57:24 +0800 |
commit | 3f12a9fc8e57c568066a9ea2698b118b975c9df3 (patch) | |
tree | 31e165dd32fecabcc679e1b36937bcbcd8bf57cb | |
parent | 9caedccf7271d1ccfb3790b698f206b5c7958f5a (diff) |
ppslists: add support for parsing baike page
-rw-r--r-- | totem/plugin/ppslist2.py | 392 |
1 files changed, 223 insertions, 169 deletions
diff --git a/totem/plugin/ppslist2.py b/totem/plugin/ppslist2.py index 8339ad4..8d3053f 100644 --- a/totem/plugin/ppslist2.py +++ b/totem/plugin/ppslist2.py @@ -14,6 +14,8 @@ import copy import sys import time import zipfile +import urllib2 +import urllib2cache from xml.dom.minidom import Document from xml.dom import minidom @@ -23,13 +25,16 @@ GENERAS_URL = 'http://list1.ppstream.com/class/generas.xml.zip' SUB_URL = 'http://list1.pps.tv/class/%d.xml.zip' MOVIE_URL = 'http://list1.ppstv.com/schs/%d.xml.zip' +CachePath = "/tmp/ppslist2" +opener = urllib2.build_opener(urllib2cache.CacheHandler(CachePath)) + def download(url, max_retry = 3, interval = 5): if not url: return '' #print 'downloading ', url for i in range (max_retry): try: - res = urllib.urlopen (url).read () + res = opener.open (url).read () except Exception, e: print "Couldn't open url: ", e res = None @@ -82,6 +87,81 @@ def extractNavigableStrings(node): for content in node.contents: if type(content) is BeautifulSoup.NavigableString: result.append(extractString(content)) + elif hasattr(content, 'contents'): + result += extractNavigableStrings(content) + return result + +def parseOptionList(s): + r = {} + opts = s.split(';') + for opt in opts: + l = opt.split('=', 1) + if len(l) < 2: + continue + k = l[0] + v = l[1] + v = v.strip("'") + r[k] = v + return r + +def parseArea (node): + area = unicode(extractString(node.contents[-1]), 'utf-8') + if area.find (u':') >= 0: + area = area[area.find(u':') + 1:] + return area.encode('utf-8') + +def parseActors(node): + actors = node.findAll('a') + result = [] + for actor in actors: + if actor.contents: + result.append(extractString(actor.contents[-1])) + result = {}.fromkeys(result).keys() + return result + +def parseMetaDataNode(node): + result = {} + result['image'] = extractString(node.find('img')['src']) + li = node.findAll('li') + href = li[0].find('a') + if len(href) >= 2: + result['area'] = parseArea(href[1].contents[-1]) + result['pubtime'] = extractString(href[0].contents[-1]) + else: + result['area'] = '' + result['pubtime'] = '' + result['actor'] = ', '.join(parseActors(li[1])) + result['desc'] = extractString(li[2].contents[-1]) + return result + +def parseBKMetaDataNode(node): + result = {} + #print node + details = node.find('div', { 'id': 'bk_details' }) + if not details: + return result + result['image'] = extractString(details.find('img')['src']).replace('/small/', '/navi/') + tfile = details.find('div', { 'class': 'tfile'}) + li = tfile.findAll('li') + actors = extractNavigableStrings(li[0]) + if len(li) > 3: + pubtime = extractString(li[3].contents[-1]).split(':')[1].strip() + else: + pubtime = '' + if len(li) > 4: + areas = extractNavigableStrings(li[4].find('span')) + areas = (''.join(areas[1:])).split(':')[1].strip() + else: + areas = '' + descnode = node.find('div', { 'class' : "boxIn", 'id' : "drama_desc" }) + if descnode: + desc = ''.join(extractNavigableStrings(descnode)) + else: + desc = '' + result['area'] = areas + result['pubtime'] = pubtime + result['actor'] = ''.join(actors[2:]) + result['desc'] = desc return result class PPSClass: @@ -94,9 +174,13 @@ class PPSClass: self.order = '' self.seq = '' self.count = 0 + self.score = 0 + self.desc_url = '' + self.meta_data = {} self.pages = {} self.max_page = 1 self.parent = None + self.children = [] def __str__(self): return 'PPSClass<%d %s %s>' % (self.id, self.title, self.count) @@ -113,6 +197,9 @@ class PPSClass: d['orderid'] = self.orderid d['seq'] = self.seq d['count'] = self.count + d['score'] = self.score + d['desc_url'] = self.desc_url + d['meta_data'] = self.meta_data return d def parse (self, node): @@ -120,16 +207,41 @@ class PPSClass: self.url = '' self.id = int(node.getAttribute('id')) self.order = str(node.getAttribute('order')) - self.orderid = int(node.getAttribute('orderid')) - self.seq = str(node.getAttribute('order')) - opt = str(node.getAttribute('opt')) - opts = opt.split(';') - for opt in opts: - k, v = opt.split('=') + if node.hasAttribute('orderid'): + self.orderid = int(node.getAttribute('orderid')) + self.seq = str(node.getAttribute('seq')) + if node.hasAttribute('opt'): + opt = unicode(node.getAttribute('opt')).encode('utf-8') + else: + opt = unicode(node.getAttribute('op')).encode('utf-8') + opts = parseOptionList(opt) + for k in opts: + v = opts[k] if k == 'sc': self.count = int(v) + elif k == 'url': + vs = v.split('|') + if len(vs) >= 3: + self.desc_url = vs[2] + elif k == 'vm': + self.score = float(v or '0.0') return self + def parseMetaData(self, s): + if not s: + return {} + s = gbk2utf8(s) + soup = BeautifulSoup.BeautifulSoup(s) + nodes = soup.findAll('div', { "class" : "pltr" }) + if not nodes: + return + self.meta_data = parseMetaDataNode(nodes[0]) + + def fetchMetaData(self): + if not self.desc_url: + return None + return download(self.desc_url) + def load(self, d): self.__dict__.update(d) @@ -180,72 +292,16 @@ class PPSClass: def getMaxPage(self): return self.max_page -class PPSFile: - elements = [ 'id', 'ci', 'size', 'url' ] - - def __str__ (self): - return 'PPSFile<%d %s %s>' % (self.id, self.title, self.url) - - def __repr__(self): - return repr(self.dictionary()) - - def __eq__ (self, other): - for attr in PPSFile.elements: - if getattr (self, attr) != getattr (other, attr): - #print attr, getattr (self, attr), getattr (other, attr) - return False - return True - - def dictionary(self): - d = {} - d['id'] = self.id - d['url'] = self.url - d['title'] = self.title - return d - - def __init__ (self): - self.id = 0 - self.ci = 0 - self.size = 0 - self.title = '' - self.url = '' - self.parent = None - - def parse (self, node, ci): - def findTexthildNode(node): - if node.nodeType == minidom.Node.TEXT_NODE: - return node - for child in node.childNodes: - print child.nodeType - if child.nodeType == minidom.Node.TEXT_NODE: - return child - return None - - idnode = node.getElementsByTagName('ID')[0] - namenode = node.getElementsByTagName('Name')[0] - urlnode = node.getElementsByTagName('URL')[0] - nametext = findTexthildNode(namenode) - urltext = findTexthildNode(urlnode) - self.id = int(idnode.getAttribute('ID')) - if nametext: - self.title = unicode(nametext.data.strip()).encode('utf-8') - if urltext: - self.url = unicode(urltext.data.strip()).encode('utf-8') - self.ci = ci - return self - - def load(self, d): - self.__dict__.update(d) + def setCount(self, count): + self.count = count class PPSMovie: elements = [ 'id', 'title', 'director', 'actor', 'area', 'size', 'pubtime', 'length', - 'lang', 'score', 'desc', 'image', - 'cn'] + 'lang', 'score', 'desc', 'image'] def __str__(self): - return 'PPSMovie<%d %s %s %d %s>' % (self.id, self.title, - self.score, self.cn, - self.actor) + return 'PPSMovie<%d %s %s %s>' % (self.id, self.title, + self.score, self.actor) def __eq__ (self, other): for i in PPSMovie.elements: @@ -264,6 +320,7 @@ class PPSMovie: def __init__ (self): self.id = 0 + self.bkid = 0 self.title = '' self.order = '' self.director = '' @@ -276,74 +333,69 @@ class PPSMovie: self.score = 0 self.desc = '' self.image = '' - self.cn = 1 - self.baseurl = '' + self.url = '' self.max_page = 1 self.files = {} self.pixbuf = None - self.parent = None self.desc_url = '' + self.meta_data = {} + self.parent = None def parse (self, node): self.files = {} self.pixbuf = None self.image = '' self.score = 0 - self.title = unicode(node.getAttribute('name')).encode('utf-8') - self.cn = 1 - self.id = int(node.getAttribute('id')) - self.order = str(node.getAttribute('order')) self.area = '' self.pubtime = '' self.actor = '' self.desc = '' - self.smallimage = self.image - self.seq = str(node.getAttribute('order')) - opt = str(node.getAttribute('op')) - opts = opt.split(';') - for opt in opts: - k, v = opt.split('=', 1) - if k == 'sc': - self.cn = int(v.strip("'")) - elif k == 'url': - vs = v.strip("'").split('|') - if len(vs) >= 3: - self.desc_url = vs[2] - elif k == 'vm': - self.score = float(v.strip("'")) + + def findTexthildNode(node): + if node.nodeType == minidom.Node.TEXT_NODE: + return node + for child in node.childNodes: + if child.nodeType == minidom.Node.TEXT_NODE: + return child + return None + + idnode = node.getElementsByTagName('ID')[0] + namenode = node.getElementsByTagName('Name')[0] + urlnode = node.getElementsByTagName('URL')[0] + nametext = findTexthildNode(namenode) + urltext = findTexthildNode(urlnode) + self.id = int(idnode.getAttribute('ID')) + self.score = float(idnode.getAttribute('VM') or '0.0') + self.bkid = int(idnode.getAttribute('BKID')) + if nametext: + self.title = unicode(nametext.data.strip()).encode('utf-8') + if urltext: + self.url = unicode(urltext.data.strip()).encode('utf-8') return self def fetchMetaData(self): - if not self.desc_url: + if not self.parent: return None - s = download(self.desc_url) + if self.parent.desc_url: + url = self.parent.desc_url + else: + url = 'http://kan.pps.tv/detail/%d.html?c_id=%d' % (self.id, self.parent.id) + s = download(url) return s - def parse_area (self, node): - area = unicode(extractString(node.contents[-1]), 'utf-8') - if area.find (u':') >= 0: - area = area[area.find(u':') + 1:] - return area.encode('utf-8') - - def parse_actors(self, node): - actors = node.findAll('a') - result = [] - for actor in actors: - if actor.contents: - result.append(extractString(actor.contents[-1])) - result = {}.fromkeys(result).keys() - return result + def fetchBKMetaData(self): + url = 'http://bk.pps.tv/ct%d/' % self.bkid + s = download(url) + return s - def parseMetaDataNode(self, node): - self.image = extractString(node.find('img')['src']) - li = node.findAll('li') - href = li[0].find('a') - if len(href) >= 2: - self.area = self.parse_area(href[1].contents[-1]) - self.pubtime = extractString(href[0].contents[-1]) - self.actor = ', '.join(self.parse_actors(li[1])) - self.desc = extractString(li[2].contents[-1]) - self.smallimage = self.image + def updateMetaData(self, meta_data): + if meta_data: + self.actor = meta_data['actor'] + self.desc = meta_data['desc'] + self.area = meta_data['area'] + self.pubtime = meta_data['pubtime'] + self.image = meta_data['image'] + self.meta_data = meta_data def parseMetaData(self, s): if not s: @@ -353,30 +405,24 @@ class PPSMovie: nodes = soup.findAll('div', { "class" : "pltr" }) if not nodes: return - self.parseMetaDataNode(nodes[0]) + meta_data = parseMetaDataNode(nodes[0]) + self.updateMetaData(meta_data) + + def parseBKMetaData(self, s): + if not s: + return + s = gbk2utf8(s) + soup = BeautifulSoup.BeautifulSoup(s) + nodes = soup.findAll('div', { "id" : "container" }) + if not nodes: + return + meta_data = parseBKMetaDataNode(nodes[0]) + self.updateMetaData(meta_data) def load(self, d): self.__dict__.update(d) self.smallimage = self.image - def getFiles(self, page_id = 0): - if page_id in self.files: - return self.files[page_id] - return [] - - def getAllFiles(self): - keys = self.files.keys() - keys.sort() - result = [] - for key in keys: - result += self.files[key] - return result - - def setFiles(self, files, page_id = 0): - self.files[page_id] = copy.copy(files) - for f in self.files[page_id]: - f.parent = self - def setMaxPage(self, max_page): self.max_page = max_page @@ -400,11 +446,7 @@ def getMovieClassList(): ppslist = download(GENERAS_URL) return parseMovieClassList(ppslist) -def getMovieList(clsid): - s = download(SUB_URL % clsid) - return parseMovieList(clsid, s) - -def parseMovieList(clsid, s): +def parseMovieSubclassList(movie_cls, s): s = unpack_zip(s) s = xmlgbk2utf8(s) dom = minidom.parseString(s) @@ -412,12 +454,17 @@ def parseMovieList(clsid, s): subs = node[0].getElementsByTagName('Sub') result = [] for sub in subs: - movie = PPSMovie() + movie = PPSClass() movie.parse(sub) + movie.parent = movie_cls result.append(movie) return result -def parseMovieFileList(movie_id, s): +def getMovieSubclassList(cls): + s = download(SUB_URL % cls.id) + return parseMovieSubclassList(cls, s) + +def parseMovieList(movie_id, s): s = unpack_zip(s) s = xmlgbk2utf8(s) dom = minidom.parseString(s) @@ -426,16 +473,15 @@ def parseMovieFileList(movie_id, s): result = [] ci = 0 for f in chs: - ppsfile = PPSFile() - result.append(ppsfile.parse(f, ci)) + movie = PPSMovie() + result.append(movie.parse(f)) ci += 1 return result -def getMovieFileList(movie_id): +def getMovieList(movie_id): url = MOVIE_URL % movie_id - print url s = download(url) - return parseMovieFileList(movie_id, s) + return parseMovieList(movie_id, s) class PPSList: def __init__(self): @@ -458,11 +504,21 @@ class PPSList: result = [] return result + def fetchSubclasses(self, movie_class): + s = download(SUB_URL % movie_class.id) + return s + + def parseSubclasses(self, movie_class, s): + return parseMovieSubclassList(movie_class, s) + def updateClasses(self, classes): self.classes += copy.copy(classes) + def updateSubclasses(self, movie_class, subclasses): + movie_class.children += copy.copy(subclasses) + def fetchMovieList(self, movie_class, page_id = 0): - url = SUB_URL % movie_class + url = MOVIE_URL % movie_class.id s = download(url) return s @@ -480,36 +536,34 @@ class PPSList: movie_class.movies += movie_list movie_class.pages[page_id] = copy.copy(movie_list) - def fetchMovie(self, cls, movie): - url = MOVIE_URL % movie.id - s = download(url) - return s - - def parseMovie(self, cls, movie, s): - result = parseMovieFileList(movie, s) - return result - - def updateMovie(self, cls, movie, file_list, page_id = 0): - movie.setFiles(file_list, page_id) - if __name__ == '__main__': def test_get_class_list(): cls_list = getMovieClassList() for cls in cls_list: print cls + subcls_list = getMovieSubclassList(cls) + for subcls in subcls_list: + print '\t', subcls + + def test_get_subclass_list(): + s = download(SUB_URL % 22) + subcls_list = parseMovieSubclassList(None, s) + for subcls in subcls_list: + s = subcls.fetchMetaData() + subcls.parseMetaData(s) + print subcls def test_get_movie_list(): - movie_list = getMovieList(141) - for movie in movie_list: - s = movie.fetchMetaData() - movie.parseMetaData(s) - print movie - - def test_get_movie_file_list(): - file_list = getMovieFileList(13020) + #http://kan.pps.tv/detail/313410.html?c_id=13020 + file_list = getMovieList(13020) for f in file_list: print f + s = f.fetchBKMetaData() + f.parseBKMetaData(s) + print 'meta_data:', f.meta_data + if s: + break #test_get_class_list() - #test_get_movie_list() - #test_get_movie_file_list() + #test_get_subclass_list() + test_get_movie_list() |