author    Luo Jinghua <sunmoon1997@gmail.com>    2010-11-14 14:57:24 +0800
committer Luo Jinghua <sunmoon1997@gmail.com>    2010-11-14 14:57:24 +0800
commit    3f12a9fc8e57c568066a9ea2698b118b975c9df3 (patch)
tree      31e165dd32fecabcc679e1b36937bcbcd8bf57cb
parent    9caedccf7271d1ccfb3790b698f206b5c7958f5a (diff)
ppslists: add support for parsing baike page
-rw-r--r--  totem/plugin/ppslist2.py  392
1 file changed, 223 insertions(+), 169 deletions(-)
diff --git a/totem/plugin/ppslist2.py b/totem/plugin/ppslist2.py
index 8339ad4..8d3053f 100644
--- a/totem/plugin/ppslist2.py
+++ b/totem/plugin/ppslist2.py
@@ -14,6 +14,8 @@ import copy
import sys
import time
import zipfile
+import urllib2
+import urllib2cache
from xml.dom.minidom import Document
from xml.dom import minidom
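
Note: urllib2cache is not in the Python 2 standard library; the patch assumes a
module exposing CacheHandler(path) that persists fetched responses under the
given directory. A rough sketch of the interface the new opener relies on
(names and behaviour are assumptions; the real module may differ):

    import hashlib
    import mimetools
    import os
    import StringIO
    import urllib2

    class CacheHandler(urllib2.BaseHandler):
        """Disk-backed cache for HTTP responses (sketch only)."""
        handler_order = 100        # consulted before the real HTTP handler

        def __init__(self, cache_dir):
            self.cache_dir = cache_dir
            if not os.path.isdir(cache_dir):
                os.makedirs(cache_dir)

        def _path(self, url):
            return os.path.join(self.cache_dir, hashlib.md5(url).hexdigest())

        def default_open(self, req):
            path = self._path(req.get_full_url())
            if not os.path.exists(path):
                return None        # fall through to urllib2's HTTP handler
            fp = StringIO.StringIO(open(path, 'rb').read())
            headers = mimetools.Message(StringIO.StringIO(''))
            resp = urllib2.addinfourl(fp, headers, req.get_full_url())
            resp.code, resp.msg = 200, 'OK'
            return resp

        def http_response(self, req, resp):
            # Cache successful responses on the way back out.
            path = self._path(req.get_full_url())
            if resp.code == 200 and not os.path.exists(path):
                data = resp.read()
                open(path, 'wb').write(data)
                fp = StringIO.StringIO(data)
                resp = urllib2.addinfourl(fp, resp.info(), req.get_full_url())
                resp.code, resp.msg = 200, 'OK'
            return resp
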
@@ -23,13 +25,16 @@ GENERAS_URL = 'http://list1.ppstream.com/class/generas.xml.zip'
SUB_URL = 'http://list1.pps.tv/class/%d.xml.zip'
MOVIE_URL = 'http://list1.ppstv.com/schs/%d.xml.zip'
+CachePath = "/tmp/ppslist2"
+opener = urllib2.build_opener(urllib2cache.CacheHandler(CachePath))
+
def download(url, max_retry = 3, interval = 5):
if not url:
return ''
#print 'downloading ', url
for i in range (max_retry):
try:
- res = urllib.urlopen (url).read ()
+ res = opener.open (url).read ()
except Exception, e:
print "Couldn't open url: ", e
res = None
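
Note: with the opener in place every download() goes through the cache, so the
retry loop only touches the network on a cold cache. Example use (assumes the
module is importable and list1.ppstream.com is reachable):

    from ppslist2 import GENERAS_URL, download

    data = download(GENERAS_URL, max_retry = 3, interval = 5)
    print '%d bytes' % len(data)
    data = download(GENERAS_URL)   # second call is served from /tmp/ppslist2
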
@@ -82,6 +87,81 @@ def extractNavigableStrings(node):
for content in node.contents:
if type(content) is BeautifulSoup.NavigableString:
result.append(extractString(content))
+ elif hasattr(content, 'contents'):
+ result += extractNavigableStrings(content)
+ return result
+
+def parseOptionList(s):
+ r = {}
+ opts = s.split(';')
+ for opt in opts:
+ l = opt.split('=', 1)
+ if len(l) < 2:
+ continue
+ k = l[0]
+ v = l[1]
+ v = v.strip("'")
+ r[k] = v
+ return r
+
+def parseArea (node):
+ area = unicode(extractString(node.contents[-1]), 'utf-8')
+ if area.find (u':') >= 0:
+ area = area[area.find(u':') + 1:]
+ return area.encode('utf-8')
+
+def parseActors(node):
+ actors = node.findAll('a')
+ result = []
+ for actor in actors:
+ if actor.contents:
+ result.append(extractString(actor.contents[-1]))
+ result = {}.fromkeys(result).keys()
+ return result
+
+def parseMetaDataNode(node):
+ result = {}
+ result['image'] = extractString(node.find('img')['src'])
+ li = node.findAll('li')
+ href = li[0].findAll('a')
+ if len(href) >= 2:
+ result['area'] = parseArea(href[1].contents[-1])
+ result['pubtime'] = extractString(href[0].contents[-1])
+ else:
+ result['area'] = ''
+ result['pubtime'] = ''
+ result['actor'] = ', '.join(parseActors(li[1]))
+ result['desc'] = extractString(li[2].contents[-1])
+ return result
+
+def parseBKMetaDataNode(node):
+ result = {}
+ #print node
+ details = node.find('div', { 'id': 'bk_details' })
+ if not details:
+ return result
+ result['image'] = extractString(details.find('img')['src']).replace('/small/', '/navi/')
+ tfile = details.find('div', { 'class': 'tfile'})
+ li = tfile.findAll('li')
+ actors = extractNavigableStrings(li[0])
+ if len(li) > 3:
+ pubtime = extractString(li[3].contents[-1]).split(':')[1].strip()
+ else:
+ pubtime = ''
+ if len(li) > 4:
+ areas = extractNavigableStrings(li[4].find('span'))
+ areas = (''.join(areas[1:])).split(':')[1].strip()
+ else:
+ areas = ''
+ descnode = node.find('div', { 'class' : "boxIn", 'id' : "drama_desc" })
+ if descnode:
+ desc = ''.join(extractNavigableStrings(descnode))
+ else:
+ desc = ''
+ result['area'] = areas
+ result['pubtime'] = pubtime
+ result['actor'] = ''.join(actors[2:])
+ result['desc'] = desc
return result
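
Note: these helpers are hoisted out of PPSMovie (see the deletions below) so
PPSClass entries can reuse them, and parseOptionList replaces the old inline
opt.split(';') loop: it splits each option on '=' only once and strips single
quotes, so values may themselves contain '='. A worked example with an invented
option string in the format the parser expects:

    from ppslist2 import parseOptionList

    opts = parseOptionList("sc=24;vm='8.7';url='a|b|http://kan.pps.tv/detail/1.html'")
    print opts['sc']                   # '24'  (count, still a string)
    print opts['vm']                   # '8.7' (quotes stripped)
    print opts['url'].split('|')[2]    # 'http://kan.pps.tv/detail/1.html'
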
class PPSClass:
@@ -94,9 +174,13 @@ class PPSClass:
self.order = ''
self.seq = ''
self.count = 0
+ self.score = 0
+ self.desc_url = ''
+ self.meta_data = {}
self.pages = {}
self.max_page = 1
self.parent = None
+ self.children = []
def __str__(self):
return 'PPSClass<%d %s %s>' % (self.id, self.title, self.count)
@@ -113,6 +197,9 @@ class PPSClass:
d['orderid'] = self.orderid
d['seq'] = self.seq
d['count'] = self.count
+ d['score'] = self.score
+ d['desc_url'] = self.desc_url
+ d['meta_data'] = self.meta_data
return d
def parse (self, node):
@@ -120,16 +207,41 @@ class PPSClass:
self.url = ''
self.id = int(node.getAttribute('id'))
self.order = str(node.getAttribute('order'))
- self.orderid = int(node.getAttribute('orderid'))
- self.seq = str(node.getAttribute('order'))
- opt = str(node.getAttribute('opt'))
- opts = opt.split(';')
- for opt in opts:
- k, v = opt.split('=')
+ if node.hasAttribute('orderid'):
+ self.orderid = int(node.getAttribute('orderid'))
+ self.seq = str(node.getAttribute('seq'))
+ if node.hasAttribute('opt'):
+ opt = unicode(node.getAttribute('opt')).encode('utf-8')
+ else:
+ opt = unicode(node.getAttribute('op')).encode('utf-8')
+ opts = parseOptionList(opt)
+ for k in opts:
+ v = opts[k]
if k == 'sc':
self.count = int(v)
+ elif k == 'url':
+ vs = v.split('|')
+ if len(vs) >= 3:
+ self.desc_url = vs[2]
+ elif k == 'vm':
+ self.score = float(v or '0.0')
return self
+ def parseMetaData(self, s):
+ if not s:
+ return {}
+ s = gbk2utf8(s)
+ soup = BeautifulSoup.BeautifulSoup(s)
+ nodes = soup.findAll('div', { "class" : "pltr" })
+ if not nodes:
+ return
+ self.meta_data = parseMetaDataNode(nodes[0])
+
+ def fetchMetaData(self):
+ if not self.desc_url:
+ return None
+ return download(self.desc_url)
+
def load(self, d):
self.__dict__.update(d)
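
Note: parse() is now defensive about the feed format: orderid and opt are
optional ('op' is accepted as a fallback spelling), seq reads the real 'seq'
attribute instead of repeating 'order', and the opt string contributes the
count (sc), the detail-page URL (third '|'-separated field of url) and the
score (vm). A minimal round-trip with an invented node:

    from xml.dom import minidom
    from ppslist2 import PPSClass

    xml = ('<Sub id="141" order="1" seq="2" '
           'opt="sc=10;vm=\'7.9\';url=\'x|y|http://kan.pps.tv/detail/1.html\'"/>')
    cls = PPSClass().parse(minidom.parseString(xml).documentElement)
    print cls.count, cls.score, cls.desc_url
    # -> 10 7.9 http://kan.pps.tv/detail/1.html
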
@@ -180,72 +292,16 @@ class PPSClass:
def getMaxPage(self):
return self.max_page
-class PPSFile:
- elements = [ 'id', 'ci', 'size', 'url' ]
-
- def __str__ (self):
- return 'PPSFile<%d %s %s>' % (self.id, self.title, self.url)
-
- def __repr__(self):
- return repr(self.dictionary())
-
- def __eq__ (self, other):
- for attr in PPSFile.elements:
- if getattr (self, attr) != getattr (other, attr):
- #print attr, getattr (self, attr), getattr (other, attr)
- return False
- return True
-
- def dictionary(self):
- d = {}
- d['id'] = self.id
- d['url'] = self.url
- d['title'] = self.title
- return d
-
- def __init__ (self):
- self.id = 0
- self.ci = 0
- self.size = 0
- self.title = ''
- self.url = ''
- self.parent = None
-
- def parse (self, node, ci):
- def findTexthildNode(node):
- if node.nodeType == minidom.Node.TEXT_NODE:
- return node
- for child in node.childNodes:
- print child.nodeType
- if child.nodeType == minidom.Node.TEXT_NODE:
- return child
- return None
-
- idnode = node.getElementsByTagName('ID')[0]
- namenode = node.getElementsByTagName('Name')[0]
- urlnode = node.getElementsByTagName('URL')[0]
- nametext = findTexthildNode(namenode)
- urltext = findTexthildNode(urlnode)
- self.id = int(idnode.getAttribute('ID'))
- if nametext:
- self.title = unicode(nametext.data.strip()).encode('utf-8')
- if urltext:
- self.url = unicode(urltext.data.strip()).encode('utf-8')
- self.ci = ci
- return self
-
- def load(self, d):
- self.__dict__.update(d)
+ def setCount(self, count):
+ self.count = count
class PPSMovie:
elements = [ 'id', 'title', 'director', 'actor',
'area', 'size', 'pubtime', 'length',
- 'lang', 'score', 'desc', 'image',
- 'cn']
+ 'lang', 'score', 'desc', 'image']
def __str__(self):
- return 'PPSMovie<%d %s %s %d %s>' % (self.id, self.title,
- self.score, self.cn,
- self.actor)
+ return 'PPSMovie<%d %s %s %s>' % (self.id, self.title,
+ self.score, self.actor)
def __eq__ (self, other):
for i in PPSMovie.elements:
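
Note: this hunk removes the per-episode PPSFile class entirely; entries of the
<Ch> file list are parsed straight into PPSMovie (see parse() below) and the
'cn' field is dropped from the comparison list, so equality now compares movie
metadata directly:

    from ppslist2 import PPSMovie

    a, b = PPSMovie(), PPSMovie()
    a.title = b.title = 'example'
    print a == b    # True: every field in PPSMovie.elements matches
    a.score = 9.0
    print a == b    # False: 'score' differs
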
@@ -264,6 +320,7 @@ class PPSMovie:
def __init__ (self):
self.id = 0
+ self.bkid = 0
self.title = ''
self.order = ''
self.director = ''
@@ -276,74 +333,69 @@ class PPSMovie:
self.score = 0
self.desc = ''
self.image = ''
- self.cn = 1
- self.baseurl = ''
+ self.url = ''
self.max_page = 1
self.files = {}
self.pixbuf = None
- self.parent = None
self.desc_url = ''
+ self.meta_data = {}
+ self.parent = None
def parse (self, node):
self.files = {}
self.pixbuf = None
self.image = ''
self.score = 0
- self.title = unicode(node.getAttribute('name')).encode('utf-8')
- self.cn = 1
- self.id = int(node.getAttribute('id'))
- self.order = str(node.getAttribute('order'))
self.area = ''
self.pubtime = ''
self.actor = ''
self.desc = ''
- self.smallimage = self.image
- self.seq = str(node.getAttribute('order'))
- opt = str(node.getAttribute('op'))
- opts = opt.split(';')
- for opt in opts:
- k, v = opt.split('=', 1)
- if k == 'sc':
- self.cn = int(v.strip("'"))
- elif k == 'url':
- vs = v.strip("'").split('|')
- if len(vs) >= 3:
- self.desc_url = vs[2]
- elif k == 'vm':
- self.score = float(v.strip("'"))
+
+ def findTextChildNode(node):
+ if node.nodeType == minidom.Node.TEXT_NODE:
+ return node
+ for child in node.childNodes:
+ if child.nodeType == minidom.Node.TEXT_NODE:
+ return child
+ return None
+
+ idnode = node.getElementsByTagName('ID')[0]
+ namenode = node.getElementsByTagName('Name')[0]
+ urlnode = node.getElementsByTagName('URL')[0]
+ nametext = findTextChildNode(namenode)
+ urltext = findTextChildNode(urlnode)
+ self.id = int(idnode.getAttribute('ID'))
+ self.score = float(idnode.getAttribute('VM') or '0.0')
+ self.bkid = int(idnode.getAttribute('BKID'))
+ if nametext:
+ self.title = unicode(nametext.data.strip()).encode('utf-8')
+ if urltext:
+ self.url = unicode(urltext.data.strip()).encode('utf-8')
return self
def fetchMetaData(self):
- if not self.desc_url:
+ if not self.parent:
return None
- s = download(self.desc_url)
+ if self.parent.desc_url:
+ url = self.parent.desc_url
+ else:
+ url = 'http://kan.pps.tv/detail/%d.html?c_id=%d' % (self.id, self.parent.id)
+ s = download(url)
return s
- def parse_area (self, node):
- area = unicode(extractString(node.contents[-1]), 'utf-8')
- if area.find (u':') >= 0:
- area = area[area.find(u':') + 1:]
- return area.encode('utf-8')
-
- def parse_actors(self, node):
- actors = node.findAll('a')
- result = []
- for actor in actors:
- if actor.contents:
- result.append(extractString(actor.contents[-1]))
- result = {}.fromkeys(result).keys()
- return result
+ def fetchBKMetaData(self):
+ url = 'http://bk.pps.tv/ct%d/' % self.bkid
+ s = download(url)
+ return s
- def parseMetaDataNode(self, node):
- self.image = extractString(node.find('img')['src'])
- li = node.findAll('li')
- href = li[0].find('a')
- if len(href) >= 2:
- self.area = self.parse_area(href[1].contents[-1])
- self.pubtime = extractString(href[0].contents[-1])
- self.actor = ', '.join(self.parse_actors(li[1]))
- self.desc = extractString(li[2].contents[-1])
- self.smallimage = self.image
+ def updateMetaData(self, meta_data):
+ if meta_data:
+ self.actor = meta_data['actor']
+ self.desc = meta_data['desc']
+ self.area = meta_data['area']
+ self.pubtime = meta_data['pubtime']
+ self.image = meta_data['image']
+ self.meta_data = meta_data
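
Note: PPSMovie.parse() now consumes the <Ch> nodes that PPSFile.parse() used to
handle: ID/Name/URL child elements, with the score (VM) and the baike id (BKID)
carried as attributes of <ID>. A hedged example of the layout inferred from the
lookups above (the real feed may carry more fields):

    from xml.dom import minidom
    from ppslist2 import PPSMovie

    xml = ('<Ch><ID ID="313410" VM="8.2" BKID="12345"/>'
           '<Name>Example Title</Name>'
           '<URL>pps://example/stream</URL></Ch>')
    m = PPSMovie().parse(minidom.parseString(xml).documentElement)
    print m.id, m.score, m.bkid, m.title, m.url
    # -> 313410 8.2 12345 Example Title pps://example/stream
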
def parseMetaData(self, s):
if not s:
@@ -353,30 +405,24 @@ class PPSMovie:
nodes = soup.findAll('div', { "class" : "pltr" })
if not nodes:
return
- self.parseMetaDataNode(nodes[0])
+ meta_data = parseMetaDataNode(nodes[0])
+ self.updateMetaData(meta_data)
+
+ def parseBKMetaData(self, s):
+ if not s:
+ return
+ s = gbk2utf8(s)
+ soup = BeautifulSoup.BeautifulSoup(s)
+ nodes = soup.findAll('div', { "id" : "container" })
+ if not nodes:
+ return
+ meta_data = parseBKMetaDataNode(nodes[0])
+ self.updateMetaData(meta_data)
def load(self, d):
self.__dict__.update(d)
self.smallimage = self.image
- def getFiles(self, page_id = 0):
- if page_id in self.files:
- return self.files[page_id]
- return []
-
- def getAllFiles(self):
- keys = self.files.keys()
- keys.sort()
- result = []
- for key in keys:
- result += self.files[key]
- return result
-
- def setFiles(self, files, page_id = 0):
- self.files[page_id] = copy.copy(files)
- for f in self.files[page_id]:
- f.parent = self
-
def setMaxPage(self, max_page):
self.max_page = max_page
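
Note: a movie's metadata can now come from two pages: the kan.pps.tv detail
page (fetchMetaData/parseMetaData, via the parent class) or the new bk.pps.tv
baike page keyed by BKID (fetchBKMetaData/parseBKMetaData). A synchronous
sketch of both paths (network access and a populated BKID are assumed):

    from ppslist2 import getMovieList

    movies = getMovieList(13020)    # subclass id from the test at the bottom
    if movies:
        m = movies[0]
        m.parseBKMetaData(m.fetchBKMetaData())   # baike page, via BKID
        if not m.meta_data and m.parent:
            m.parseMetaData(m.fetchMetaData())   # detail-page fallback
        print 'meta_data:', m.meta_data
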
@@ -400,11 +446,7 @@ def getMovieClassList():
ppslist = download(GENERAS_URL)
return parseMovieClassList(ppslist)
-def getMovieList(clsid):
- s = download(SUB_URL % clsid)
- return parseMovieList(clsid, s)
-
-def parseMovieList(clsid, s):
+def parseMovieSubclassList(movie_cls, s):
s = unpack_zip(s)
s = xmlgbk2utf8(s)
dom = minidom.parseString(s)
@@ -412,12 +454,17 @@ def parseMovieList(clsid, s):
subs = node[0].getElementsByTagName('Sub')
result = []
for sub in subs:
- movie = PPSMovie()
+ movie = PPSClass()
movie.parse(sub)
+ movie.parent = movie_cls
result.append(movie)
return result
-def parseMovieFileList(movie_id, s):
+def getMovieSubclassList(cls):
+ s = download(SUB_URL % cls.id)
+ return parseMovieSubclassList(cls, s)
+
+def parseMovieList(movie_id, s):
s = unpack_zip(s)
s = xmlgbk2utf8(s)
dom = minidom.parseString(s)
@@ -426,16 +473,15 @@ def parseMovieFileList(movie_id, s):
result = []
ci = 0
for f in chs:
- ppsfile = PPSFile()
- result.append(ppsfile.parse(f, ci))
+ movie = PPSMovie()
+ result.append(movie.parse(f))
ci += 1
return result
-def getMovieFileList(movie_id):
+def getMovieList(movie_id):
url = MOVIE_URL % movie_id
- print url
s = download(url)
- return parseMovieFileList(movie_id, s)
+ return parseMovieList(movie_id, s)
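
Note: the renames make the three levels explicit: getMovieClassList() returns
the top-level categories, getMovieSubclassList(cls) returns PPSClass children
(what parseMovieList used to return as PPSMovie objects), and getMovieList(id)
returns the PPSMovie entries that used to be PPSFile. Walking one branch
(network access assumed):

    from ppslist2 import (getMovieClassList, getMovieSubclassList,
                          getMovieList)

    for cls in getMovieClassList():
        for sub in getMovieSubclassList(cls):    # sub.parent is cls
            movies = getMovieList(sub.id)        # was getMovieFileList
            print cls.title, '/', sub.title, '->', len(movies), 'movies'
            break
        break
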
class PPSList:
def __init__(self):
@@ -458,11 +504,21 @@ class PPSList:
result = []
return result
+ def fetchSubclasses(self, movie_class):
+ s = download(SUB_URL % movie_class.id)
+ return s
+
+ def parseSubclasses(self, movie_class, s):
+ return parseMovieSubclassList(movie_class, s)
+
def updateClasses(self, classes):
self.classes += copy.copy(classes)
+ def updateSubclasses(self, movie_class, subclasses):
+ movie_class.children += copy.copy(subclasses)
+
def fetchMovieList(self, movie_class, page_id = 0):
- url = SUB_URL % movie_class
+ url = MOVIE_URL % movie_class.id
s = download(url)
return s
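
Note: PPSList keeps the fetch/parse/update split so the Totem plugin can run
the blocking fetch step off the main thread and apply the parsed result back on
it; the new subclass triad mirrors the existing class and movie ones. Used
synchronously:

    from ppslist2 import PPSList, getMovieClassList

    pl = PPSList()
    classes = getMovieClassList()
    pl.updateClasses(classes)
    for cls in classes:
        raw = pl.fetchSubclasses(cls)          # blocking download
        subs = pl.parseSubclasses(cls, raw)    # pure parsing, no I/O
        pl.updateSubclasses(cls, subs)         # appends to cls.children
        print cls, '->', len(cls.children), 'subclasses'
        break
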
@@ -480,36 +536,34 @@ class PPSList:
movie_class.movies += movie_list
movie_class.pages[page_id] = copy.copy(movie_list)
- def fetchMovie(self, cls, movie):
- url = MOVIE_URL % movie.id
- s = download(url)
- return s
-
- def parseMovie(self, cls, movie, s):
- result = parseMovieFileList(movie, s)
- return result
-
- def updateMovie(self, cls, movie, file_list, page_id = 0):
- movie.setFiles(file_list, page_id)
-
if __name__ == '__main__':
def test_get_class_list():
cls_list = getMovieClassList()
for cls in cls_list:
print cls
+ subcls_list = getMovieSubclassList(cls)
+ for subcls in subcls_list:
+ print '\t', subcls
+
+ def test_get_subclass_list():
+ s = download(SUB_URL % 22)
+ subcls_list = parseMovieSubclassList(None, s)
+ for subcls in subcls_list:
+ s = subcls.fetchMetaData()
+ subcls.parseMetaData(s)
+ print subcls
def test_get_movie_list():
- movie_list = getMovieList(141)
- for movie in movie_list:
- s = movie.fetchMetaData()
- movie.parseMetaData(s)
- print movie
-
- def test_get_movie_file_list():
- file_list = getMovieFileList(13020)
+ #http://kan.pps.tv/detail/313410.html?c_id=13020
+ file_list = getMovieList(13020)
for f in file_list:
print f
+ s = f.fetchBKMetaData()
+ f.parseBKMetaData(s)
+ print 'meta_data:', f.meta_data
+ if s:
+ break
#test_get_class_list()
- #test_get_movie_list()
- #test_get_movie_file_list()
+ #test_get_subclass_list()
+ test_get_movie_list()
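
Note: running the module directly (python ppslist2.py) now exercises the new
movie-list path: it prints every PPSMovie in subclass 13020, then fetches and
parses the baike page of the first movie whose page downloads successfully. To
poke the baike parser in isolation (hypothetical bkid; network access assumed):

    from ppslist2 import PPSMovie

    m = PPSMovie()
    m.bkid = 12345                   # hypothetical baike id
    m.parseBKMetaData(m.fetchBKMetaData())
    print m.meta_data                # {} if the page was missing or unparsable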