diff options
author | Luo Jinghua <sunmoon1997@gmail.com> | 2010-11-13 01:05:31 +0800 |
---|---|---|
committer | Luo Jinghua <sunmoon1997@gmail.com> | 2010-11-13 01:05:31 +0800 |
commit | 9caedccf7271d1ccfb3790b698f206b5c7958f5a (patch) | |
tree | f4320d45142dda9ed4c3cd09efef977815802700 | |
parent | 64456b89c708bc5e2f64276e0cb48c3a7ed75660 (diff) |
ppslist2: another ppslist
-rw-r--r-- | totem/plugin/ppslist2.py | 515 |
1 files changed, 515 insertions, 0 deletions
diff --git a/totem/plugin/ppslist2.py b/totem/plugin/ppslist2.py new file mode 100644 index 0000000..8339ad4 --- /dev/null +++ b/totem/plugin/ppslist2.py @@ -0,0 +1,515 @@ +#!/bin/env python +# -*- coding: utf-8 -*- +# -*- python -*- +# Author: Luo Jinghua + +import urllib +import httplib +import htmlentitydefs +import time +import re +import os +import StringIO +import copy +import sys +import time +import zipfile + +from xml.dom.minidom import Document +from xml.dom import minidom +import BeautifulSoup + +GENERAS_URL = 'http://list1.ppstream.com/class/generas.xml.zip' +SUB_URL = 'http://list1.pps.tv/class/%d.xml.zip' +MOVIE_URL = 'http://list1.ppstv.com/schs/%d.xml.zip' + +def download(url, max_retry = 3, interval = 5): + if not url: + return '' + #print 'downloading ', url + for i in range (max_retry): + try: + res = urllib.urlopen (url).read () + except Exception, e: + print "Couldn't open url: ", e + res = None + time.sleep (interval) + if res: + break + return res + +def gbk2utf8(s): + return unicode(s, 'gb18030', 'ignore').encode('utf-8') + +def xmlgbk2utf8(s): + s = gbk2utf8(s) + s = s.replace('GB18030', 'UTF-8', 1) + return s + +def unpack_zip(s): + f = StringIO.StringIO(s) + zf = zipfile.ZipFile(f, 'r') + r = zf.open(zf.namelist()[0]).read() + return r + +def extractString(s): + return unicode(s).encode('utf-8') + +def extractStrings(ss): + result = [] + for s in ss: + result.append(extractString(s)) + return result + +def extractNumberString(s, default = ''): + s = extractString(s) + n = re.search(r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?', s) + if n: + return n.group() + return default + +def extractNumber(s, default = 0): + s = extractString(s) + n = re.search(r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?', s) + if n: + if '.' in n.group() or 'e' in n.group() or 'E' in n.group(): + return float(n.group()) + return int(n.group()) + return default + +def extractNavigableStrings(node): + result = [] + for content in node.contents: + if type(content) is BeautifulSoup.NavigableString: + result.append(extractString(content)) + return result + +class PPSClass: + def __init__ (self): + self.has_id = True + self.id = 0 + self.title = '' + self.url = '' + self.orderid = 0 + self.order = '' + self.seq = '' + self.count = 0 + self.pages = {} + self.max_page = 1 + self.parent = None + + def __str__(self): + return 'PPSClass<%d %s %s>' % (self.id, self.title, self.count) + + def __repr__(self): + return repr(self.dictionary()) + + def dictionary(self): + d = {} + d['id'] = self.id + d['url'] = self.url + d['title'] = self.title + d['order'] = self.order + d['orderid'] = self.orderid + d['seq'] = self.seq + d['count'] = self.count + return d + + def parse (self, node): + self.title = unicode(node.getAttribute('name')).encode('utf-8') + self.url = '' + self.id = int(node.getAttribute('id')) + self.order = str(node.getAttribute('order')) + self.orderid = int(node.getAttribute('orderid')) + self.seq = str(node.getAttribute('order')) + opt = str(node.getAttribute('opt')) + opts = opt.split(';') + for opt in opts: + k, v = opt.split('=') + if k == 'sc': + self.count = int(v) + return self + + def load(self, d): + self.__dict__.update(d) + + def resetMovies(self): + self.pages = {} + self.max_page = 1 + + def addMovies(self, movies, page_id = 0): + movies = copy.copy(movies) + for movie in movies: + movie.parent = self + + if not self.pages.has_key(page_id): + self.pages[page_id] = [] + self.pages[page_id] += movies + + def setMovies(self, movies, page_id = 0, clone = True): + if clone: + movies = copy.copy(movies) + for movie in movies: + movie.parent = self + + self.pages[page_id] = movies + + def getMovies(self): + pages = self.getPages() + movies = [] + for page in pages: + movies += page + return movies + + def getPage(self, page_id): + if page_id == 0 and not self.max_page: + return [] + assert (page_id >= 0 and page_id < self.max_page) + if self.pages.has_key(page_id): + return self.pages[page_id] + return [] + + def getPages(self): + keys = self.pages.keys() + keys.sort() + pages = [] + for key in keys: + pages.append(self.pages[key]) + return pages + + def getMaxPage(self): + return self.max_page + +class PPSFile: + elements = [ 'id', 'ci', 'size', 'url' ] + + def __str__ (self): + return 'PPSFile<%d %s %s>' % (self.id, self.title, self.url) + + def __repr__(self): + return repr(self.dictionary()) + + def __eq__ (self, other): + for attr in PPSFile.elements: + if getattr (self, attr) != getattr (other, attr): + #print attr, getattr (self, attr), getattr (other, attr) + return False + return True + + def dictionary(self): + d = {} + d['id'] = self.id + d['url'] = self.url + d['title'] = self.title + return d + + def __init__ (self): + self.id = 0 + self.ci = 0 + self.size = 0 + self.title = '' + self.url = '' + self.parent = None + + def parse (self, node, ci): + def findTexthildNode(node): + if node.nodeType == minidom.Node.TEXT_NODE: + return node + for child in node.childNodes: + print child.nodeType + if child.nodeType == minidom.Node.TEXT_NODE: + return child + return None + + idnode = node.getElementsByTagName('ID')[0] + namenode = node.getElementsByTagName('Name')[0] + urlnode = node.getElementsByTagName('URL')[0] + nametext = findTexthildNode(namenode) + urltext = findTexthildNode(urlnode) + self.id = int(idnode.getAttribute('ID')) + if nametext: + self.title = unicode(nametext.data.strip()).encode('utf-8') + if urltext: + self.url = unicode(urltext.data.strip()).encode('utf-8') + self.ci = ci + return self + + def load(self, d): + self.__dict__.update(d) + +class PPSMovie: + elements = [ 'id', 'title', 'director', 'actor', + 'area', 'size', 'pubtime', 'length', + 'lang', 'score', 'desc', 'image', + 'cn'] + def __str__(self): + return 'PPSMovie<%d %s %s %d %s>' % (self.id, self.title, + self.score, self.cn, + self.actor) + + def __eq__ (self, other): + for i in PPSMovie.elements: + if getattr (self, i) != getattr (other, i): + return False + return self.files == other.files + + def __repr__(self): + return repr(self.dictionary()) + + def dictionary(self): + d = {} + for key in PPSMovie.elements: + d[key] = getattr(self, key) + return d + + def __init__ (self): + self.id = 0 + self.title = '' + self.order = '' + self.director = '' + self.actor = '' + self.area = '' + self.size = 0 + self.pubtime = '' + self.length = '' + self.lang = '' + self.score = 0 + self.desc = '' + self.image = '' + self.cn = 1 + self.baseurl = '' + self.max_page = 1 + self.files = {} + self.pixbuf = None + self.parent = None + self.desc_url = '' + + def parse (self, node): + self.files = {} + self.pixbuf = None + self.image = '' + self.score = 0 + self.title = unicode(node.getAttribute('name')).encode('utf-8') + self.cn = 1 + self.id = int(node.getAttribute('id')) + self.order = str(node.getAttribute('order')) + self.area = '' + self.pubtime = '' + self.actor = '' + self.desc = '' + self.smallimage = self.image + self.seq = str(node.getAttribute('order')) + opt = str(node.getAttribute('op')) + opts = opt.split(';') + for opt in opts: + k, v = opt.split('=', 1) + if k == 'sc': + self.cn = int(v.strip("'")) + elif k == 'url': + vs = v.strip("'").split('|') + if len(vs) >= 3: + self.desc_url = vs[2] + elif k == 'vm': + self.score = float(v.strip("'")) + return self + + def fetchMetaData(self): + if not self.desc_url: + return None + s = download(self.desc_url) + return s + + def parse_area (self, node): + area = unicode(extractString(node.contents[-1]), 'utf-8') + if area.find (u':') >= 0: + area = area[area.find(u':') + 1:] + return area.encode('utf-8') + + def parse_actors(self, node): + actors = node.findAll('a') + result = [] + for actor in actors: + if actor.contents: + result.append(extractString(actor.contents[-1])) + result = {}.fromkeys(result).keys() + return result + + def parseMetaDataNode(self, node): + self.image = extractString(node.find('img')['src']) + li = node.findAll('li') + href = li[0].find('a') + if len(href) >= 2: + self.area = self.parse_area(href[1].contents[-1]) + self.pubtime = extractString(href[0].contents[-1]) + self.actor = ', '.join(self.parse_actors(li[1])) + self.desc = extractString(li[2].contents[-1]) + self.smallimage = self.image + + def parseMetaData(self, s): + if not s: + return + s = gbk2utf8(s) + soup = BeautifulSoup.BeautifulSoup(s) + nodes = soup.findAll('div', { "class" : "pltr" }) + if not nodes: + return + self.parseMetaDataNode(nodes[0]) + + def load(self, d): + self.__dict__.update(d) + self.smallimage = self.image + + def getFiles(self, page_id = 0): + if page_id in self.files: + return self.files[page_id] + return [] + + def getAllFiles(self): + keys = self.files.keys() + keys.sort() + result = [] + for key in keys: + result += self.files[key] + return result + + def setFiles(self, files, page_id = 0): + self.files[page_id] = copy.copy(files) + for f in self.files[page_id]: + f.parent = self + + def setMaxPage(self, max_page): + self.max_page = max_page + + def getMaxPage(self): + return self.max_page + +def parseMovieClassList(res): + res = unpack_zip(res) + res = xmlgbk2utf8(res) + dom = minidom.parseString(res) + node = dom.getElementsByTagName('Gens') + gens = node[0].getElementsByTagName('Gen') + result = [] + for gen in gens: + cls = PPSClass() + cls.parse(gen) + result.append(cls) + return result + +def getMovieClassList(): + ppslist = download(GENERAS_URL) + return parseMovieClassList(ppslist) + +def getMovieList(clsid): + s = download(SUB_URL % clsid) + return parseMovieList(clsid, s) + +def parseMovieList(clsid, s): + s = unpack_zip(s) + s = xmlgbk2utf8(s) + dom = minidom.parseString(s) + node = dom.getElementsByTagName('Subs') + subs = node[0].getElementsByTagName('Sub') + result = [] + for sub in subs: + movie = PPSMovie() + movie.parse(sub) + result.append(movie) + return result + +def parseMovieFileList(movie_id, s): + s = unpack_zip(s) + s = xmlgbk2utf8(s) + dom = minidom.parseString(s) + node = dom.getElementsByTagName('Chs') + chs = node[0].getElementsByTagName('Ch') + result = [] + ci = 0 + for f in chs: + ppsfile = PPSFile() + result.append(ppsfile.parse(f, ci)) + ci += 1 + return result + +def getMovieFileList(movie_id): + url = MOVIE_URL % movie_id + print url + s = download(url) + return parseMovieFileList(movie_id, s) + +class PPSList: + def __init__(self): + self.classes = [] + + def reset(self): + self.classes = [] + + def getClasses(self): + return self.classes + + def fetchClasses(self): + s = download(GENERAS_URL) + return s + + def parseClasses(self, s): + if s: + result = parseMovieClassList(s) + else: + result = [] + return result + + def updateClasses(self, classes): + self.classes += copy.copy(classes) + + def fetchMovieList(self, movie_class, page_id = 0): + url = SUB_URL % movie_class + s = download(url) + return s + + def searchMovieList(self, keyword, page_id = 0): + s = '' + return s + + def parseMovieList(self, movie_class, s, page_id = 0): + result = parseMovieList(movie_class, s) + return result + + def updateMovieList(self, movie_class, movie_list, page_id = 0): + if page_id == 0: + movie_class.movies = [] + movie_class.movies += movie_list + movie_class.pages[page_id] = copy.copy(movie_list) + + def fetchMovie(self, cls, movie): + url = MOVIE_URL % movie.id + s = download(url) + return s + + def parseMovie(self, cls, movie, s): + result = parseMovieFileList(movie, s) + return result + + def updateMovie(self, cls, movie, file_list, page_id = 0): + movie.setFiles(file_list, page_id) + +if __name__ == '__main__': + def test_get_class_list(): + cls_list = getMovieClassList() + for cls in cls_list: + print cls + + def test_get_movie_list(): + movie_list = getMovieList(141) + for movie in movie_list: + s = movie.fetchMetaData() + movie.parseMetaData(s) + print movie + + def test_get_movie_file_list(): + file_list = getMovieFileList(13020) + for f in file_list: + print f + + #test_get_class_list() + #test_get_movie_list() + #test_get_movie_file_list() |