summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuo Jinghua <sunmoon1997@gmail.com>2010-11-13 01:05:31 +0800
committerLuo Jinghua <sunmoon1997@gmail.com>2010-11-13 01:05:31 +0800
commit9caedccf7271d1ccfb3790b698f206b5c7958f5a (patch)
treef4320d45142dda9ed4c3cd09efef977815802700
parent64456b89c708bc5e2f64276e0cb48c3a7ed75660 (diff)
ppslist2: another ppslist
-rw-r--r--totem/plugin/ppslist2.py515
1 files changed, 515 insertions, 0 deletions
diff --git a/totem/plugin/ppslist2.py b/totem/plugin/ppslist2.py
new file mode 100644
index 0000000..8339ad4
--- /dev/null
+++ b/totem/plugin/ppslist2.py
@@ -0,0 +1,515 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+# -*- python -*-
+# Author: Luo Jinghua
+
+import urllib
+import httplib
+import htmlentitydefs
+import time
+import re
+import os
+import StringIO
+import copy
+import sys
+import time
+import zipfile
+
+from xml.dom.minidom import Document
+from xml.dom import minidom
+import BeautifulSoup
+
# URLs of the zipped XML catalogue documents served by PPS.
# NOTE(review): three different hostnames are used (ppstream.com /
# pps.tv / ppstv.com) -- presumably aliases of the same list server;
# verify they all still resolve.
GENERAS_URL = 'http://list1.ppstream.com/class/generas.xml.zip'
SUB_URL = 'http://list1.pps.tv/class/%d.xml.zip'
MOVIE_URL = 'http://list1.ppstv.com/schs/%d.xml.zip'
+
+def download(url, max_retry = 3, interval = 5):
+ if not url:
+ return ''
+ #print 'downloading ', url
+ for i in range (max_retry):
+ try:
+ res = urllib.urlopen (url).read ()
+ except Exception, e:
+ print "Couldn't open url: ", e
+ res = None
+ time.sleep (interval)
+ if res:
+ break
+ return res
+
def gbk2utf8(s):
    """Re-encode a GB18030/GBK byte string as UTF-8, dropping
    undecodable bytes."""
    decoded = unicode(s, 'gb18030', 'ignore')
    return decoded.encode('utf-8')
+
def xmlgbk2utf8(s):
    """Convert a GB18030-encoded XML document to UTF-8 and fix the
    first occurrence of the encoding declaration to match."""
    converted = gbk2utf8(s)
    return converted.replace('GB18030', 'UTF-8', 1)
+
def unpack_zip(s):
    """Return the decompressed contents of the first member of a zip
    archive given as a byte string.

    Raises zipfile.BadZipfile on malformed input and IndexError on an
    archive with no members.
    """
    f = StringIO.StringIO(s)
    zf = zipfile.ZipFile(f, 'r')
    try:
        # ZipFile.read avoids leaving the per-member file handle open
        # (the original zf.open(...).read() leaked both handles)
        r = zf.read(zf.namelist()[0])
    finally:
        zf.close()
    return r
+
def extractString(s):
    """Coerce a (possibly BeautifulSoup) string-like value to a plain
    UTF-8 encoded byte string."""
    text = unicode(s)
    return text.encode('utf-8')
+
def extractStrings(ss):
    """Apply extractString to every element of ss; returns a list."""
    return [extractString(item) for item in ss]
+
def extractNumberString(s, default = ''):
    """Return the first number found in s, as a string, or default
    when s contains no number."""
    text = extractString(s)
    match = re.search(r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?', text)
    return match.group() if match else default
+
def extractNumber(s, default = 0):
    """Return the first number found in s as an int (or a float when
    it has a decimal point or exponent), or default when none found."""
    text = extractString(s)
    match = re.search(r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?', text)
    if not match:
        return default
    token = match.group()
    # a decimal point or an exponent marks a float literal
    if any(c in token for c in '.eE'):
        return float(token)
    return int(token)
+
def extractNavigableStrings(node):
    """Collect the direct text (NavigableString) children of a
    BeautifulSoup node as UTF-8 byte strings."""
    return [extractString(child) for child in node.contents
            if type(child) is BeautifulSoup.NavigableString]
+
class PPSClass:
    """A PPS channel/category, parsed from a <Gen> element of the
    class-list XML, that also holds the movie pages fetched for it."""

    def __init__ (self):
        self.has_id = True
        self.id = 0
        self.title = ''    # UTF-8 encoded category name
        self.url = ''
        self.orderid = 0
        self.order = ''
        self.seq = ''
        self.count = 0     # movie count ('sc' field of the opt attribute)
        self.pages = {}    # page id -> list of movies
        self.max_page = 1
        self.parent = None

    def __str__(self):
        return 'PPSClass<%d %s %s>' % (self.id, self.title, self.count)

    def __repr__(self):
        return repr(self.dictionary())

    def dictionary(self):
        """Return the serializable attributes as a plain dict."""
        d = {}
        d['id'] = self.id
        d['url'] = self.url
        d['title'] = self.title
        d['order'] = self.order
        d['orderid'] = self.orderid
        d['seq'] = self.seq
        d['count'] = self.count
        return d

    def parse (self, node):
        """Fill in this class from a <Gen> minidom element; returns self."""
        self.title = unicode(node.getAttribute('name')).encode('utf-8')
        self.url = ''
        self.id = int(node.getAttribute('id'))
        self.order = str(node.getAttribute('order'))
        self.orderid = int(node.getAttribute('orderid'))
        self.seq = str(node.getAttribute('order'))
        opt = str(node.getAttribute('opt'))
        for item in opt.split(';'):
            # items look like 'key=value'; skip malformed entries and
            # split at the first '=' only (consistent with PPSMovie.parse)
            if '=' not in item:
                continue
            k, v = item.split('=', 1)
            if k == 'sc':
                self.count = int(v)
        return self

    def load(self, d):
        """Restore attributes from a dict produced by dictionary()."""
        self.__dict__.update(d)

    def resetMovies(self):
        """Drop all stored movie pages."""
        self.pages = {}
        self.max_page = 1

    def addMovies(self, movies, page_id = 0):
        """Append (a shallow copy of) movies to page page_id,
        reparenting each movie to this class."""
        movies = copy.copy(movies)
        for movie in movies:
            movie.parent = self

        # 'in' replaces the deprecated dict.has_key
        if page_id not in self.pages:
            self.pages[page_id] = []
        self.pages[page_id] += movies

    def setMovies(self, movies, page_id = 0, clone = True):
        """Replace page page_id with movies (copied unless clone is
        False), reparenting each movie to this class."""
        if clone:
            movies = copy.copy(movies)
        for movie in movies:
            movie.parent = self

        self.pages[page_id] = movies

    def getMovies(self):
        """Return all movies of all pages concatenated in page order."""
        pages = self.getPages()
        movies = []
        for page in pages:
            movies += page
        return movies

    def getPage(self, page_id):
        """Return the movies of one page; asserts 0 <= page_id < max_page."""
        if page_id == 0 and not self.max_page:
            return []
        assert (page_id >= 0 and page_id < self.max_page)
        if page_id in self.pages:
            return self.pages[page_id]
        return []

    def getPages(self):
        """Return the stored pages as a list, ordered by page id."""
        return [self.pages[key] for key in sorted(self.pages)]

    def getMaxPage(self):
        return self.max_page
+
class PPSFile:
    """A single playable file (episode/part) of a PPSMovie, parsed
    from a <Ch> element of the movie file-list XML."""

    # attributes that participate in equality comparison
    elements = [ 'id', 'ci', 'size', 'url' ]

    def __str__ (self):
        return 'PPSFile<%d %s %s>' % (self.id, self.title, self.url)

    def __repr__(self):
        return repr(self.dictionary())

    def __eq__ (self, other):
        for attr in PPSFile.elements:
            if getattr (self, attr) != getattr (other, attr):
                return False
        return True

    def __ne__ (self, other):
        # required under Python 2, where __eq__ does not imply '!='
        return not self.__eq__ (other)

    def dictionary(self):
        """Return the serializable attributes as a plain dict."""
        d = {}
        d['id'] = self.id
        d['url'] = self.url
        d['title'] = self.title
        return d

    def __init__ (self):
        self.id = 0
        self.ci = 0        # position of this file within the movie
        self.size = 0
        self.title = ''    # UTF-8 encoded
        self.url = ''      # UTF-8 encoded playback url
        self.parent = None # owning PPSMovie

    def parse (self, node, ci):
        """Fill in this file from a <Ch> minidom element; ci is the
        0-based position of the element in the list.  Returns self."""
        def findTextChildNode(node):
            # return node itself if it is a text node, else its first
            # text child, else None (fixed typo; dropped debug print)
            if node.nodeType == minidom.Node.TEXT_NODE:
                return node
            for child in node.childNodes:
                if child.nodeType == minidom.Node.TEXT_NODE:
                    return child
            return None

        idnode = node.getElementsByTagName('ID')[0]
        namenode = node.getElementsByTagName('Name')[0]
        urlnode = node.getElementsByTagName('URL')[0]
        nametext = findTextChildNode(namenode)
        urltext = findTextChildNode(urlnode)
        self.id = int(idnode.getAttribute('ID'))
        if nametext:
            self.title = unicode(nametext.data.strip()).encode('utf-8')
        if urltext:
            self.url = unicode(urltext.data.strip()).encode('utf-8')
        self.ci = ci
        return self

    def load(self, d):
        """Restore attributes from a dict produced by dictionary()."""
        self.__dict__.update(d)
+
class PPSMovie:
    """A movie/series entry parsed from a <Sub> element of a class
    movie list, plus metadata scraped from its description page and
    the PPSFile pages attached to it."""

    # attributes that participate in equality comparison
    # (files are compared separately in __eq__)
    elements = [ 'id', 'title', 'director', 'actor',
                 'area', 'size', 'pubtime', 'length',
                 'lang', 'score', 'desc', 'image',
                 'cn']

    def __str__(self):
        return 'PPSMovie<%d %s %s %d %s>' % (self.id, self.title,
                                             self.score, self.cn,
                                             self.actor)

    def __eq__ (self, other):
        for i in PPSMovie.elements:
            if getattr (self, i) != getattr (other, i):
                return False
        return self.files == other.files

    def __ne__ (self, other):
        # required under Python 2, where __eq__ does not imply '!='
        return not self.__eq__ (other)

    def __repr__(self):
        return repr(self.dictionary())

    def dictionary(self):
        """Return the serializable attributes as a plain dict."""
        d = {}
        for key in PPSMovie.elements:
            d[key] = getattr(self, key)
        return d

    def __init__ (self):
        self.id = 0
        self.title = ''     # UTF-8 encoded
        self.order = ''
        self.director = ''
        self.actor = ''     # comma-separated actor names
        self.area = ''
        self.size = 0
        self.pubtime = ''
        self.length = ''
        self.lang = ''
        self.score = 0      # rating ('vm' field of the op attribute)
        self.desc = ''
        self.image = ''     # cover image url
        self.cn = 1         # 'sc' field of the op attribute
        self.baseurl = ''
        self.max_page = 1
        self.files = {}     # page id -> list of PPSFile
        self.pixbuf = None  # cached cover image (set externally)
        self.parent = None  # owning PPSClass
        self.desc_url = ''  # description page url for fetchMetaData

    def parse (self, node):
        """Fill in this movie from a <Sub> minidom element; returns self."""
        self.files = {}
        self.pixbuf = None
        self.image = ''
        self.score = 0
        self.title = unicode(node.getAttribute('name')).encode('utf-8')
        self.cn = 1
        self.id = int(node.getAttribute('id'))
        self.order = str(node.getAttribute('order'))
        self.area = ''
        self.pubtime = ''
        self.actor = ''
        self.desc = ''
        self.smallimage = self.image
        self.seq = str(node.getAttribute('order'))
        opt = str(node.getAttribute('op'))
        for item in opt.split(';'):
            # items look like "key='value'"; skip malformed entries
            # (an empty 'op' attribute made the original raise ValueError)
            if '=' not in item:
                continue
            k, v = item.split('=', 1)
            if k == 'sc':
                self.cn = int(v.strip("'"))
            elif k == 'url':
                vs = v.strip("'").split('|')
                if len(vs) >= 3:
                    self.desc_url = vs[2]
            elif k == 'vm':
                self.score = float(v.strip("'"))
        return self

    def fetchMetaData(self):
        """Download the raw description page, or None without a desc_url."""
        if not self.desc_url:
            return None
        s = download(self.desc_url)
        return s

    def parse_area (self, node):
        """Extract the area string, stripping a leading 'label:' prefix."""
        area = unicode(extractString(node.contents[-1]), 'utf-8')
        if area.find (u':') >= 0:
            area = area[area.find(u':') + 1:]
        return area.encode('utf-8')

    def parse_actors(self, node):
        """Collect the unique actor names from the <a> links below node."""
        actors = node.findAll('a')
        result = []
        for actor in actors:
            if actor.contents:
                result.append(extractString(actor.contents[-1]))
        # de-duplicate (order is not preserved)
        result = {}.fromkeys(result).keys()
        return result

    def parseMetaDataNode(self, node):
        """Scrape image/area/pubtime/actor/desc out of the 'pltr' div.

        NOTE(review): depends on the exact layout of the pps.tv
        description page -- verify against a live page before changing.
        """
        self.image = extractString(node.find('img')['src'])
        li = node.findAll('li')
        href = li[0].find('a')
        if len(href) >= 2:
            self.area = self.parse_area(href[1].contents[-1])
            self.pubtime = extractString(href[0].contents[-1])
        self.actor = ', '.join(self.parse_actors(li[1]))
        self.desc = extractString(li[2].contents[-1])
        self.smallimage = self.image

    def parseMetaData(self, s):
        """Parse a description page (GBK bytes) and fill in metadata;
        silently does nothing when s is empty or has no 'pltr' div."""
        if not s:
            return
        s = gbk2utf8(s)
        soup = BeautifulSoup.BeautifulSoup(s)
        nodes = soup.findAll('div', { "class" : "pltr" })
        if not nodes:
            return
        self.parseMetaDataNode(nodes[0])

    def load(self, d):
        """Restore attributes from a dict produced by dictionary()."""
        self.__dict__.update(d)
        self.smallimage = self.image

    def getFiles(self, page_id = 0):
        """Return the files of one page, or [] when the page is missing."""
        if page_id in self.files:
            return self.files[page_id]
        return []

    def getAllFiles(self):
        """Return the files of every page concatenated in page order."""
        result = []
        for key in sorted(self.files):
            result += self.files[key]
        return result

    def setFiles(self, files, page_id = 0):
        """Store (a shallow copy of) files as page page_id,
        reparenting each file to this movie."""
        self.files[page_id] = copy.copy(files)
        for f in self.files[page_id]:
            f.parent = self

    def setMaxPage(self, max_page):
        self.max_page = max_page

    def getMaxPage(self):
        return self.max_page
+
def parseMovieClassList(res):
    """Parse the zipped generas.xml class list (GBK-encoded) into a
    list of PPSClass objects."""
    xml = xmlgbk2utf8(unpack_zip(res))
    dom = minidom.parseString(xml)
    gens = dom.getElementsByTagName('Gens')[0].getElementsByTagName('Gen')
    return [PPSClass().parse(gen) for gen in gens]
+
def getMovieClassList():
    """Download and parse the top-level class (genre) list."""
    return parseMovieClassList(download(GENERAS_URL))
+
def getMovieList(clsid):
    """Download and parse the movie list of the class with id clsid."""
    data = download(SUB_URL % clsid)
    return parseMovieList(clsid, data)
+
def parseMovieList(clsid, s):
    """Parse a zipped <Subs> movie-list document into PPSMovie objects."""
    xml = xmlgbk2utf8(unpack_zip(s))
    dom = minidom.parseString(xml)
    subs = dom.getElementsByTagName('Subs')[0].getElementsByTagName('Sub')
    movies = []
    for sub in subs:
        movies.append(PPSMovie().parse(sub))
    return movies
+
def parseMovieFileList(movie_id, s):
    """Parse a zipped <Chs> document into a list of PPSFile objects.

    movie_id is currently unused; it is kept for interface
    compatibility with the callers.
    """
    s = unpack_zip(s)
    s = xmlgbk2utf8(s)
    dom = minidom.parseString(s)
    node = dom.getElementsByTagName('Chs')
    chs = node[0].getElementsByTagName('Ch')
    result = []
    # enumerate replaces the manually maintained ci counter
    for ci, f in enumerate(chs):
        result.append(PPSFile().parse(f, ci))
    return result
+
def getMovieFileList(movie_id):
    """Download and parse the file list of the movie with id movie_id."""
    # (dropped the stray debug 'print url' left in the original)
    url = MOVIE_URL % movie_id
    s = download(url)
    return parseMovieFileList(movie_id, s)
+
class PPSList:
    """High-level fetch/parse/update driver for the PPS catalogue.

    Each operation is split into a fetch*/parse*/update* triple so the
    blocking network download can run separately from parsing and from
    mutating the model objects.
    """

    def __init__(self):
        self.classes = []

    def reset(self):
        """Forget every previously collected class."""
        self.classes = []

    def getClasses(self):
        """Return the collected PPSClass list."""
        return self.classes

    def fetchClasses(self):
        """Download the raw (zipped) class-list document."""
        return download(GENERAS_URL)

    def parseClasses(self, s):
        """Parse a raw class-list document; an empty download yields []."""
        return parseMovieClassList(s) if s else []

    def updateClasses(self, classes):
        """Append (a shallow copy of) classes to the collected list."""
        self.classes += copy.copy(classes)

    def fetchMovieList(self, movie_class, page_id = 0):
        """Download the raw movie-list document for a class id."""
        return download(SUB_URL % movie_class)

    def searchMovieList(self, keyword, page_id = 0):
        """Search is not implemented; always returns an empty document."""
        return ''

    def parseMovieList(self, movie_class, s, page_id = 0):
        """Parse a raw movie-list document into PPSMovie objects."""
        return parseMovieList(movie_class, s)

    def updateMovieList(self, movie_class, movie_list, page_id = 0):
        """Store movie_list as page page_id of movie_class."""
        if page_id == 0:
            # the first page replaces any previously stored movies
            movie_class.movies = []
        movie_class.movies += movie_list
        movie_class.pages[page_id] = copy.copy(movie_list)

    def fetchMovie(self, cls, movie):
        """Download the raw file-list document of a movie."""
        return download(MOVIE_URL % movie.id)

    def parseMovie(self, cls, movie, s):
        """Parse a raw file-list document into PPSFile objects."""
        return parseMovieFileList(movie, s)

    def updateMovie(self, cls, movie, file_list, page_id = 0):
        """Attach file_list as page page_id of movie."""
        movie.setFiles(file_list, page_id)
+
if __name__ == '__main__':
    # Ad-hoc smoke tests.  Each one hits the live PPS list servers, so
    # they are left commented out by default; uncomment a call at the
    # bottom to run it.
    def test_get_class_list():
        # dump every top-level class (genre)
        cls_list = getMovieClassList()
        for cls in cls_list:
            print cls

    def test_get_movie_list():
        # dump the movies of class 141, including scraped metadata
        movie_list = getMovieList(141)
        for movie in movie_list:
            s = movie.fetchMetaData()
            movie.parseMetaData(s)
            print movie

    def test_get_movie_file_list():
        # dump the playable files of movie 13020
        file_list = getMovieFileList(13020)
        for f in file_list:
            print f

    #test_get_class_list()
    #test_get_movie_list()
    #test_get_movie_file_list()