summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Luo Jinghua <sunmoon1997@gmail.com> 2010-01-21 23:19:36 +0800
committer Luo Jinghua <sunmoon1997@gmail.com> 2010-01-21 23:19:36 +0800
commit 71cb1f2636a27479d378da957a2129d98657f753 (patch)
tree 58ad2ddb65512586c44386d6e472289f83f36187
parent dbd3bd13da70d22c1e3e965aa9270522e9d63884 (diff)
ppslist: add methods for parsing long play lists
-rw-r--r-- totem/plugin/ppslist.py 90
1 files changed, 90 insertions, 0 deletions
diff --git a/totem/plugin/ppslist.py b/totem/plugin/ppslist.py
index b507801..8510a5c 100644
--- a/totem/plugin/ppslist.py
+++ b/totem/plugin/ppslist.py
@@ -367,6 +367,58 @@ def parseMovieFileList(movie, s):
ci += 1
return result
+def getMovieFileListFrameUrl(s):
+    """Return the URL of the play-list iframe embedded in a player page.
+
+    s is the raw page source; it is passed through gbk2utf8() before
+    being parsed (presumably the page is GBK-encoded -- confirm against
+    gbk2utf8).
+
+    Returns the 'src' attribute of the first <iframe id="play_iframe">,
+    or '' when the page has no such iframe or the iframe has no 'src'.
+    """
+    s = gbk2utf8(s)
+    soup = BeautifulSoup.BeautifulSoup(s)
+    play_iframe = soup.findAll('iframe', { 'id': 'play_iframe' })
+    if not play_iframe:
+        return ''
+    # Tag.get() avoids a KeyError on an iframe that lacks a 'src'
+    # attribute; callers already treat a falsy result as "not found".
+    return play_iframe[0].get('src') or ''
+
+def getMovieFileListFrameBaseUrl(s):
+    """Return the play-list iframe URL of page s minus its last path segment.
+
+    The URL is obtained from getMovieFileListFrameUrl(s).  A falsy URL
+    is returned unchanged.  Otherwise everything from the last '/'
+    onwards is cut off; a URL that contains no '/', or whose last '/'
+    is its very first character, is returned as is.
+    """
+    frame_url = getMovieFileListFrameUrl(s)
+    if frame_url:
+        slash = frame_url.rfind('/')
+        # Index 0 (leading slash only) and -1 (no slash) leave the URL alone.
+        if slash > 0:
+            frame_url = frame_url[:slash]
+    return frame_url
+
+def parseMoviePlayList(s):
+    """Parse one page of a paginated play list.
+
+    s is the raw page source; it is run through gbk2utf8() before
+    being handed to BeautifulSoup.
+
+    Returns a (files, maxpage) tuple.  files is a list of PPSFile
+    objects, one per <li> of the first <ul> inside the first
+    <div class="onseeIn">, with id, title and url filled in.  maxpage
+    is the larger of two numbers read from the first
+    <div class="pageNav">: the text of its last <a> and the text of
+    its first <span>.
+
+    Returns () when the list container, its <ul>, its <li> items or
+    the page navigation block cannot be found.
+    """
+    soup = BeautifulSoup.BeautifulSoup(gbk2utf8(s))
+
+    containers = soup.findAll('div', { 'class': 'onseeIn' })
+    if not containers:
+        return ()
+    lists = containers[0].findAll('ul')
+    if not lists:
+        return ()
+    entries = lists[0].findAll('li')
+    if not entries:
+        return ()
+
+    # NOTE(review): an <li> without an 'id' or with fewer than two <a>
+    # children raises KeyError/IndexError here -- confirm the page
+    # layout guarantees both.
+    files = []
+    for entry in entries:
+        pps_file = PPSFile()
+        file_id = extractNumber(entry['id'])
+        anchors = entry.findAll('a')
+        # The first link supplies the title text, the second the target.
+        label = anchors[0].contents[-1]
+        target = anchors[1]['href']
+        pps_file.id = file_id
+        pps_file.title = unicode(label).encode('utf-8')
+        pps_file.url = unicode(target).encode('utf-8')
+        files.append(pps_file)
+
+    navs = soup.findAll('div', { 'class': 'pageNav' })
+    if not navs:
+        return ()
+    nav = navs[0]
+    last_link_page = int(nav.findAll('a')[-1].contents[-1])
+    # Presumably the <span> marks the page being viewed -- verify
+    # against the site markup.
+    span_page = int(nav.findAll('span')[0].contents[-1])
+    return (files, max(last_link_page, span_page))
+
def getMovieFileList(movie):
s = download(movie.playerurl)
return parseMovieFileList(movie, s)
@@ -467,6 +519,43 @@ if __name__ == '__main__':
sys.exit(0)
+ def test_get_file_list_by_iframe():
+     """Manual check of the iframe-based play list fallback.
+
+     Downloads a fixed player page and prints every file found by
+     parseMovieFileList().  If that yields nothing, the base URL of the
+     play-list iframe is looked up and the pages '<base>/0.html',
+     '<base>/1.html', ... are fetched and parsed in turn, printing the
+     files of each page.  The walk stops when a download returns
+     nothing, a page cannot be parsed, or the reported maximum page
+     equals the current page index.
+     """
+     url = 'http://kan.pps.tv/play/298356.html'
+     s = download(url)
+     movie = PPSMovie()
+     file_list = parseMovieFileList(movie, s)
+     for f in file_list:
+         print f
+
+     if file_list:
+         return
+     baseurl = getMovieFileListFrameBaseUrl(s)
+     print 'baseurl = ', baseurl
+     # Without an iframe URL there is nothing to walk; do not request
+     # the bogus relative URL '/0.html'.
+     if not baseurl:
+         return
+
+     # The loop fetches page 0 itself and stops on a failed download,
+     # so no separate probe request is needed.
+     cur_page = 0
+     while True:
+         s = download(baseurl + '/%d.html' % cur_page)
+         if not s:
+             break
+         result = parseMoviePlayList(s)
+         if not result:
+             break
+         files, max_page = result
+         print 'maxpage = ', max_page, cur_page
+         for f in files:
+             print f
+
+         if max_page == cur_page:
+             break
+         cur_page += 1
+
+
def test_search ():
ppslist = PPSList()
s = ppslist.searchMovieList('周星驰')
@@ -520,3 +609,4 @@ if __name__ == '__main__':
#test_ppslist()
#test_search()
#test_get_file_list()
+ #test_get_file_list_by_iframe()