蛋疼脚本:Baidu直播贴终结者

  前几天在百度 DOTA 吧看到一个欢乐直播贴。在百度贴吧看直播贴实在是一件十分蛋疼的事情:连“只看楼主”的功能都没有(比 dzh 什么的差远了),于是只能自己动手写个脚本了。

#coding: utf-8
"""
百度直播贴提取脚本
@author:piglei2007@gmail.com
@version:1.0
"""
import urllib
import urlparse
from BeautifulSoup import BeautifulSoup

class BaiduZhibo(object):
    """
    可迭代的对象,不停返回下一页含有给定username的内容(默认为楼主)
    返回格式:
        字典:{
            "url": "..."        #当前链接地址
            "page": 5           #当前页数
            "content": [...]    #列表,里面有当前页每一个指定username的发言内容
        }
        
    参数:
        url:    帖子地址
        obj_name:   需要抓取的用户昵称,默认为楼主
        limit:  限定抓取页面的数量,默认无限制
        html:   设定输出格式,True不做处理,False替换换行符、空格
    """
    
    def __init__(self, url, obj_name=None, limit=0, html=False):
        self.starturl = url
        self.obj_name = obj_name
        self.limit = limit
        self.counter = 0
        self.html = html
        self.nowurl = url
        self.page = 1
        
    def next(self):
        if (self.limit and self.counter == self.limit) or (self.nowurl is None):
            print "finished."
            raise StopIteration
        
        result = {}
        result["url"] = self.nowurl
        result["page"] = self.page
        result["content"] = []
        content = urllib.urlopen(self.nowurl).read().decode("gbk", "ignore")
        soup = BeautifulSoup(content)
        posts = soup.findAll("div", {"class": "post"})
        for post in posts:
            username = self.get_username(post)
            if not self.obj_name:
                self.obj_name = username
            if username == self.obj_name:
                floor, cc = self.parse_post(post)
                
                if not self.html:
                    cc = self.replace_htmltag(cc)
                result["content"].append({
                    "floor": floor,
                    "content": cc,
                })
        self.nowurl = self.get_nexturl(soup)
        self.counter += 1
        self.page += 1
        return result
        
    def __iter__(self):
        return self
    
    def get_username(self, post):
        """
        获得用户姓名
        """
        return post.find("td").get("username")
    
    def parse_post(self, post):
        """
        返回楼数和内容
        """
        _content = post.find('td', {"class": "content"})
                        
        floor = _content.find('p', {"class": "floor"}).renderContents()
        cc = _content.find('cc').renderContents()
        return floor, cc

    def get_nexturl(self, soup):
        """
        从页面获得下一页url
        """
        next = soup.find(text=u"下一页")
        if next:
            return urlparse.urljoin(self.starturl, next.parent.get("href")).encode("utf-8")
        return None
        
    @staticmethod
    def replace_htmltag(content):
        content = content.replace("<br />", "\n")
        content = content.replace("&nbsp;", " ")
        return content

# Markup fragments of the generated report.  The whitespace inside them is
# part of the output and is kept exactly as written.
HTML_HEAD = '''
    <html>
    <head>
    <meta http-equiv="content-Type" content="text/html; charset=utf-8" />
    <style type="text/css">
    * {font-size: 14px;margin: 0;padding: 0}
    .content_top {
        padding: 10px 4px;
        font-weight: bold;
    }
    .content_main {
        padding: 10px 4px;
        border-bottom: 1px solid #e0e0e0;
    }
    .floor {color: gray}
    </style>
    </head>
    <body>
    <div style="width: 950px;margin: auto">
    '''

PAGE_TEMPLATE = '''
        <div class="content_top">
        第 %(page)s 页:<a href="%(url)s" target="_blank">%(url)s</a>
        </div>
        '''

POST_TEMPLATE = '''
            <div class="content_main">
                <span class="floor">%(floor)s</span>:%(content)s
            </div>
            '''

HTML_TAIL = '''
    </div>
    </body>
    </html>
    '''

if __name__ == '__main__':
    # html=True keeps the post markup intact because the output is itself HTML.
    bz = BaiduZhibo("http://tieba.baidu.com/f?kz=923968151", html=True)
    # Open the report only when run as a script, so importing this module no
    # longer truncates the file; the with-block closes it even if the crawl
    # fails part-way.
    with open("gua_hacker.html", "w") as objfile:
        objfile.write(HTML_HEAD)
        for x in bz:
            # Pages without any matching post produce no section at all.
            if not x["content"]:
                continue
            # Progress line: page number and address, separated by a space.
            print("%s %s" % (x["page"], x["url"]))
            objfile.write(PAGE_TEMPLATE % x)
            for each in x["content"]:
                objfile.write(POST_TEMPLATE % each)
        objfile.write(HTML_TAIL)

  这个便是脚本的效果页面了:http://zlovezl.cn/static/uploaded/2010/11/gua_hacker.html

  脚本用到了一个解析 HTML/XML 的包 BeautifulSoup,我是现学现写的,挺好用,地址在这:http://www.crummy.com/software/BeautifulSoup/