ppslist2: another ppslist

author: Luo Jinghua <sunmoon1997@gmail.com> 2010-11-13 01:05:31 +0800
committer: Luo Jinghua <sunmoon1997@gmail.com> 2010-11-13 01:05:31 +0800
commit: 9caedccf7271d1ccfb3790b698f206b5c7958f5a (patch)
tree: f4320d45142dda9ed4c3cd09efef977815802700
parent: 64456b89c708bc5e2f64276e0cb48c3a7ed75660 (diff)
1 files changed, 515 insertions, 0 deletions
diff --git a/totem/plugin/ppslist2.py b/totem/plugin/ppslist2.py
new file mode 100644
index 0000000..8339ad4
--- /dev/null
+++ b/totem/plugin/ppslist2.py
@@ -0,0 +1,515 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+# -*- python -*-
+# Author: Luo Jinghua
+
+import urllib
+import httplib
+import htmlentitydefs
+import time
+import re
+import os
+import StringIO
+import copy
+import sys
+import time
+import zipfile
+
+from xml.dom.minidom import Document
+from xml.dom import minidom
+import BeautifulSoup
+
+GENERAS_URL = 'http://list1.ppstream.com/class/generas.xml.zip'
+SUB_URL = 'http://list1.pps.tv/class/%d.xml.zip'
+MOVIE_URL = 'http://list1.ppstv.com/schs/%d.xml.zip'
+
+def download(url, max_retry = 3, interval = 5):
+    if not url:
+        return ''
+    #print 'downloading ', url
+    for i in range (max_retry):
+        try:
+            res = urllib.urlopen (url).read ()
+        except Exception, e:
+            print "Couldn't open url: ", e
+            res = None
+            time.sleep (interval)
+        if res:
+            break
+    return res
+
+def gbk2utf8(s):
+    return unicode(s, 'gb18030', 'ignore').encode('utf-8')
+
+def xmlgbk2utf8(s):
+    s = gbk2utf8(s)
+    s = s.replace('GB18030', 'UTF-8', 1)
+    return s
+
+def unpack_zip(s):
+    f = StringIO.StringIO(s)
+    zf = zipfile.ZipFile(f, 'r')
+    r = zf.open(zf.namelist()[0]).read()
+    return r
+
+def extractString(s):
+    return unicode(s).encode('utf-8')
+
+def extractStrings(ss):
+    result = []
+    for s in ss:
+        result.append(extractString(s))
+    return result
+
+def extractNumberString(s, default = ''):
+    s = extractString(s)
+    n = re.search(r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?', s)
+    if n:
+        return n.group()
+    return default
+
+def extractNumber(s, default = 0):
+    s = extractString(s)
+    n = re.search(r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?', s)
+    if n:
+        if '.' in n.group() or 'e' in n.group() or 'E' in n.group():
+            return float(n.group())
+        return int(n.group())
+    return default
+
+def extractNavigableStrings(node):
+    result = []
+    for content in node.contents:
+        if type(content) is BeautifulSoup.NavigableString:
+            result.append(extractString(content))
+    return result
+
+class PPSClass:
+    def __init__ (self):
+        self.has_id = True
+        self.id = 0
+        self.title = ''
+        self.url = ''
+        self.orderid = 0
+        self.order = ''
+        self.seq = ''
+        self.count = 0
+        self.pages = {}
+        self.max_page = 1
+        self.parent = None
+
+    def __str__(self):
+        return 'PPSClass<%d %s %s>' % (self.id, self.title, self.count)
+
+    def __repr__(self):
+        return repr(self.dictionary())
+
+    def dictionary(self):
+        d = {}
+        d['id'] = self.id
+        d['url'] = self.url
+        d['title'] = self.title
+        d['order'] = self.order
+        d['orderid'] = self.orderid
+        d['seq'] = self.seq
+        d['count'] = self.count
+        return d
+
+    def parse (self, node):
+        self.title = unicode(node.getAttribute('name')).encode('utf-8')
+        self.url = ''
+        self.id = int(node.getAttribute('id'))
+        self.order = str(node.getAttribute('order'))
+        self.orderid = int(node.getAttribute('orderid'))
+        self.seq = str(node.getAttribute('order'))
+        opt = str(node.getAttribute('opt'))
+        opts = opt.split(';')
+        for opt in opts:
+            k, v = opt.split('=')
+            if k == 'sc':
+                self.count = int(v)
+        return self
+
+    def load(self, d):
+        self.__dict__.update(d)
+
+    def resetMovies(self):
+        self.pages = {}
+        self.max_page = 1
+
+    def addMovies(self, movies, page_id = 0):
+        movies = copy.copy(movies)
+        for movie in movies:
+            movie.parent = self
+
+        if not self.pages.has_key(page_id):
+            self.pages[page_id] = []
+        self.pages[page_id] += movies
+
+    def setMovies(self, movies, page_id = 0, clone = True):
+        if clone:
+            movies = copy.copy(movies)
+        for movie in movies:
+            movie.parent = self
+
+        self.pages[page_id] = movies
+
+    def getMovies(self):
+        pages = self.getPages()
+        movies = []
+        for page in pages:
+            movies += page
+        return movies
+
+    def getPage(self, page_id):
+        if page_id == 0 and not self.max_page:
+	    return []
+        assert (page_id >= 0 and page_id < self.max_page)
+        if self.pages.has_key(page_id):
+            return self.pages[page_id]
+        return []
+
+    def getPages(self):
+        keys = self.pages.keys()
+        keys.sort()
+        pages = []
+        for key in keys:
+            pages.append(self.pages[key])
+        return pages
+
+    def getMaxPage(self):
+        return self.max_page
+
+class PPSFile:
+    elements = [ 'id', 'ci', 'size', 'url' ]
+
+    def __str__ (self):
+        return 'PPSFile<%d %s %s>' % (self.id, self.title, self.url)
+
+    def __repr__(self):
+        return repr(self.dictionary())
+
+    def __eq__ (self, other):
+        for attr in PPSFile.elements:
+            if getattr (self, attr) != getattr (other, attr):
+                #print attr, getattr (self, attr), getattr (other, attr)
+                return False
+        return True
+
+    def dictionary(self):
+        d = {}
+        d['id'] = self.id
+        d['url'] = self.url
+        d['title'] = self.title
+        return d
+
+    def __init__ (self):
+        self.id = 0
+        self.ci = 0
+        self.size = 0
+        self.title = ''
+        self.url = ''
+        self.parent = None
+
+    def parse (self, node, ci):
+        def findTexthildNode(node):
+            if node.nodeType == minidom.Node.TEXT_NODE:
+                return node
+            for child in node.childNodes:
+                print child.nodeType
+                if child.nodeType == minidom.Node.TEXT_NODE:
+                    return child
+            return None
+
+        idnode = node.getElementsByTagName('ID')[0]
+        namenode = node.getElementsByTagName('Name')[0]
+        urlnode = node.getElementsByTagName('URL')[0]
+        nametext = findTexthildNode(namenode)
+        urltext = findTexthildNode(urlnode)
+        self.id = int(idnode.getAttribute('ID'))
+        if nametext:
+            self.title = unicode(nametext.data.strip()).encode('utf-8')
+        if urltext:
+            self.url = unicode(urltext.data.strip()).encode('utf-8')
+        self.ci = ci
+        return self
+
+    def load(self, d):
+        self.__dict__.update(d)
+
+class PPSMovie:
+    elements = [ 'id', 'title', 'director', 'actor',
+                 'area', 'size', 'pubtime', 'length',
+                 'lang', 'score', 'desc', 'image',
+                 'cn']
+    def __str__(self):
+        return 'PPSMovie<%d %s %s %d %s>' % (self.id, self.title,
+                                             self.score, self.cn,
+                                             self.actor)
+
+    def __eq__ (self, other):
+        for i in PPSMovie.elements:
+            if getattr (self, i) != getattr (other, i):
+                return False
+        return self.files == other.files
+
+    def __repr__(self):
+        return repr(self.dictionary())
+
+    def dictionary(self):
+        d = {}
+        for key in PPSMovie.elements:
+            d[key] = getattr(self, key)
+        return d
+
+    def __init__ (self):
+        self.id = 0
+        self.title = ''
+        self.order = ''
+        self.director = ''
+        self.actor = ''
+        self.area = ''
+        self.size = 0
+        self.pubtime = ''
+        self.length = ''
+        self.lang = ''
+        self.score = 0
+        self.desc = ''
+        self.image = ''
+        self.cn = 1
+        self.baseurl = ''
+        self.max_page = 1
+        self.files = {}
+        self.pixbuf = None
+        self.parent = None
+        self.desc_url = ''
+
+    def parse (self, node):
+        self.files = {}
+        self.pixbuf = None
+        self.image = ''
+        self.score = 0
+        self.title = unicode(node.getAttribute('name')).encode('utf-8')
+        self.cn = 1
+        self.id = int(node.getAttribute('id'))
+        self.order = str(node.getAttribute('order'))
+        self.area = ''
+        self.pubtime = ''
+        self.actor = ''
+        self.desc = ''
+        self.smallimage = self.image
+        self.seq = str(node.getAttribute('order'))
+        opt = str(node.getAttribute('op'))
+        opts = opt.split(';')
+        for opt in opts:
+            k, v = opt.split('=', 1)
+            if k == 'sc':
+                self.cn = int(v.strip("'"))
+            elif k == 'url':
+                vs = v.strip("'").split('|')
+                if len(vs) >= 3:
+                    self.desc_url = vs[2]
+            elif k == 'vm':
+                self.score = float(v.strip("'"))
+        return self
+
+    def fetchMetaData(self):
+        if not self.desc_url:
+            return None
+        s = download(self.desc_url)
+        return s
+
+    def parse_area (self, node):
+        area = unicode(extractString(node.contents[-1]), 'utf-8')
+        if area.find (u'：') >= 0:
+            area = area[area.find(u'：') + 1:]
+        return area.encode('utf-8')
+
+    def parse_actors(self, node):
+        actors = node.findAll('a')
+        result = []
+        for actor in actors:
+            if actor.contents:
+                result.append(extractString(actor.contents[-1]))
+        result = {}.fromkeys(result).keys()
+        return result
+
+    def parseMetaDataNode(self, node):
+        self.image = extractString(node.find('img')['src'])
+        li = node.findAll('li')
+        href = li[0].find('a')
+        if len(href) >= 2:
+            self.area = self.parse_area(href[1].contents[-1])
+            self.pubtime = extractString(href[0].contents[-1])
+        self.actor = ', '.join(self.parse_actors(li[1]))
+        self.desc = extractString(li[2].contents[-1])
+        self.smallimage = self.image
+
+    def parseMetaData(self, s):
+        if not s:
+            return
+        s = gbk2utf8(s)
+        soup = BeautifulSoup.BeautifulSoup(s)
+        nodes = soup.findAll('div', { "class" : "pltr" })
+        if not nodes:
+            return
+        self.parseMetaDataNode(nodes[0])
+
+    def load(self, d):
+        self.__dict__.update(d)
+        self.smallimage = self.image
+
+    def getFiles(self, page_id = 0):
+        if page_id in self.files:
+            return self.files[page_id]
+        return []
+
+    def getAllFiles(self):
+        keys = self.files.keys()
+        keys.sort()
+        result = []
+        for key in keys:
+            result += self.files[key]
+        return result
+
+    def setFiles(self, files, page_id = 0):
+        self.files[page_id] = copy.copy(files)
+        for f in self.files[page_id]:
+            f.parent = self
+
+    def setMaxPage(self, max_page):
+        self.max_page = max_page
+
+    def getMaxPage(self):
+        return self.max_page
+
+def parseMovieClassList(res):
+    res = unpack_zip(res)
+    res = xmlgbk2utf8(res)
+    dom = minidom.parseString(res)
+    node = dom.getElementsByTagName('Gens')
+    gens = node[0].getElementsByTagName('Gen')
+    result = []
+    for gen in gens:
+        cls = PPSClass()
+        cls.parse(gen)
+        result.append(cls)
+    return result
+
+def getMovieClassList():
+    ppslist = download(GENERAS_URL)
+    return parseMovieClassList(ppslist)
+
+def getMovieList(clsid):
+    s = download(SUB_URL % clsid)
+    return parseMovieList(clsid, s)
+
+def parseMovieList(clsid, s):
+    s = unpack_zip(s)
+    s = xmlgbk2utf8(s)
+    dom = minidom.parseString(s)
+    node = dom.getElementsByTagName('Subs')
+    subs = node[0].getElementsByTagName('Sub')
+    result = []
+    for sub in subs:
+        movie = PPSMovie()
+        movie.parse(sub)
+        result.append(movie)
+    return result
+
+def parseMovieFileList(movie_id, s):
+    s = unpack_zip(s)
+    s = xmlgbk2utf8(s)
+    dom = minidom.parseString(s)
+    node = dom.getElementsByTagName('Chs')
+    chs = node[0].getElementsByTagName('Ch')
+    result = []
+    ci = 0
+    for f in chs:
+        ppsfile = PPSFile()
+        result.append(ppsfile.parse(f, ci))
+        ci += 1
+    return result
+
+def getMovieFileList(movie_id):
+    url = MOVIE_URL % movie_id
+    print url
+    s = download(url)
+    return parseMovieFileList(movie_id, s)
+
+class PPSList:
+    def __init__(self):
+        self.classes = []
+
+    def reset(self):
+        self.classes = []
+
+    def getClasses(self):
+        return self.classes
+
+    def fetchClasses(self):
+        s = download(GENERAS_URL)
+        return s
+
+    def parseClasses(self, s):
+        if s:
+            result = parseMovieClassList(s)
+        else:
+            result = []
+        return result
+
+    def updateClasses(self, classes):
+        self.classes += copy.copy(classes)
+
+    def fetchMovieList(self, movie_class, page_id = 0):
+        url = SUB_URL % movie_class
+        s = download(url)
+        return s
+
+    def searchMovieList(self, keyword, page_id = 0):
+        s = ''
+        return s
+
+    def parseMovieList(self, movie_class, s, page_id = 0):
+        result = parseMovieList(movie_class, s)
+        return result
+
+    def updateMovieList(self, movie_class, movie_list, page_id = 0):
+        if page_id == 0:
+            movie_class.movies = []
+        movie_class.movies += movie_list
+        movie_class.pages[page_id] = copy.copy(movie_list)
+
+    def fetchMovie(self, cls, movie):
+        url = MOVIE_URL % movie.id
+        s = download(url)
+        return s
+
+    def parseMovie(self, cls, movie, s):
+        result = parseMovieFileList(movie, s)
+        return result
+
+    def updateMovie(self, cls, movie, file_list, page_id = 0):
+        movie.setFiles(file_list, page_id)
+
+if __name__ == '__main__':
+    def test_get_class_list():
+        cls_list = getMovieClassList()
+        for cls in cls_list:
+            print cls
+
+    def test_get_movie_list():
+        movie_list = getMovieList(141)
+        for movie in movie_list:
+            s = movie.fetchMetaData()
+            movie.parseMetaData(s)
+            print movie
+
+    def test_get_movie_file_list():
+        file_list = getMovieFileList(13020)
+        for f in file_list:
+            print f
+
+    #test_get_class_list()
+    #test_get_movie_list()
+    #test_get_movie_file_list()
author	Luo Jinghua <sunmoon1997@gmail.com>	2010-11-13 01:05:31 +0800
committer	Luo Jinghua <sunmoon1997@gmail.com>	2010-11-13 01:05:31 +0800
commit	9caedccf7271d1ccfb3790b698f206b5c7958f5a (patch)
tree	f4320d45142dda9ed4c3cd09efef977815802700
parent	64456b89c708bc5e2f64276e0cb48c3a7ed75660 (diff)