The first login requires a captcha; on later runs the `login('username', 'password')` call can be skipped, because the session persists the necessary login cookies to disk. The code is as follows:
```python
import requests
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib
import re
import time
import os.path
import json
import threading
import codecs

from bs4 import BeautifulSoup
try:
    from PIL import Image
except ImportError:
    pass

from mywordCloud import save_jieba_result
from mywordCloud import draw_wordcloud

# Construct the request headers
agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
headers = {
    "Host": "www.douban.com",
    "Referer": "https://www.douban.com/",
    'User-Agent': agent,
}

# Log in with cookies: the session persists them to the file 'cookies'
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookies')
try:
    session.cookies.load(ignore_discard=True)
    print('Cookies loaded successfully')
except Exception:
    print('Cookies could not be loaded')


def get_captcha(url):
    # Download the captcha image
    print('Fetching captcha:', url)
    r = session.get(url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    # Display the captcha with Pillow's Image; if Pillow is not installed,
    # open captcha.jpg in the source directory and read it by hand
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except Exception:
        print(u'Please open %s and read the captcha manually' % os.path.abspath('captcha.jpg'))
    captcha = input("please input the captcha\n>")
    return captcha


def isLogin():
    # Request the personal homepage; a 200 without a redirect means we are logged in
    url = 'https://www.douban.com/people/151607908/'
    login_code = session.get(url, headers=headers, allow_redirects=False).status_code
    return login_code == 200


def login(account, secret):
    douban = "https://www.douban.com/"
    htmlcha = session.get(douban, headers=headers).text
    patterncha = r'id="captcha_image" src="(.*?)" alt="captcha"'
    httpcha = re.findall(patterncha, htmlcha)
    pattern2 = r'type="hidden" name="captcha-id" value="(.*?)"'
    hidden_value = re.findall(pattern2, htmlcha)
    print(hidden_value)
    post_data = {
        "source": "index_nav",
        'form_email': account,
        'form_password': secret,
    }
    if len(httpcha) > 0:
        print('Captcha link:', httpcha)
        captcha = get_captcha(httpcha[0])
        post_data['captcha-solution'] = captcha
        post_data['captcha-id'] = hidden_value[0]
        print(post_data)
    post_url = 'https://www.douban.com/accounts/login'
    session.post(post_url, data=post_data, headers=headers)
    # Save the cookies so the next run can skip the login step
    session.cookies.save()
    if isLogin():
        print('Login succeeded')
    else:
        print('Login failed')


def get_movie_sort():
    time.sleep(1)
    movie_url = 'https://movie.douban.com/chart'
    html = session.get(movie_url, headers=headers)
    soup = BeautifulSoup(html.text, 'html.parser')
    result = soup.find_all('a', {'class': 'nbg'})
    print(result)


# Crawl the short comments (single-threaded)
def get_comment(filename):
    # filename is the file the crawled comments are saved to
    begin = 1
    comment_url = 'https://movie.douban.com/subject/11600078/comments'
    next_url = '?start=20&limit=20&sort=new_score&status=P'
    headers2 = {
        "Host": "movie.douban.com",
        "Referer": "https://www.douban.com/",
        'User-Agent': agent,
        'Connection': 'keep-alive',
    }
    f = open(filename, 'w+', encoding='utf-8')
    while True:
        time.sleep(6)
        html = session.get(url=comment_url + next_url, headers=headers2)
        soup = BeautifulSoup(html.text, 'html.parser')
        # All comment blocks on the current page
        result = soup.find_all('div', {'class': 'comment'})
        # Regex alternative for the comment text (unused; slicing is used below)
        pattern4 = r'<p class="">(.*?)</p>'
        for item in result:
            s = str(item)
            # The comment text sits between <p class=""> and </p>
            count2 = s.find('<p class="">')
            count3 = s.find('</p>')
            s2 = s[count2 + 12:count3]  # slice the comment text out of the markup
            if 'class' not in s2:
                f.write(s2)
        # Find the link to the next page
        next_url = soup.find_all('div', {'id': 'paginator'})
        pattern3 = r'href="(.*?)">后页'
        if len(next_url) == 0:
            break
        next_url = re.findall(pattern3, str(next_url[0]))
        if len(next_url) == 0:  # no "next page" link: stop
            break
        next_url = next_url[0]
        print('%d Crawling the next page of comments...' % begin)
        begin = begin + 1
        # Take a longer break after every six pages
        if begin % 6 == 0:
            time.sleep(40)
            print('Resting...')
        print(next_url)
    f.close()


# Multi-threaded crawler for the Douban comments
def thread_get_comment(filename):
    next_url = '?start=19&limit=20&sort=new_score&status=P'
    headers2 = {
        "Host": "movie.douban.com",
        "Referer": "https://www.douban.com/",
        'User-Agent': agent,
        'Connection': 'keep-alive',
    }
    f = open(filename, 'w+', encoding='utf-8')
    comment_url = 'https://movie.douban.com/subject/26363254/comments'
    crawl_queue = [comment_url + next_url]
    crawl_queue.append('https://movie.douban.com/subject/26363254/comments?start=144&limit=20&sort=new_score&status=P')
    seen = set(crawl_queue)

    def process_queue():
        begin = 1
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                time.sleep(5)
                html = session.get(url=url, headers=headers2)
                soup = BeautifulSoup(html.text, 'html.parser')
                # All comment blocks on the current page
                result = soup.find_all('div', {'class': 'comment'})
                for item in result:
                    s = str(item)
                    count2 = s.find('<p class="">')
                    count3 = s.find('</p>')
                    s2 = s[count2 + 12:count3]  # slice the comment text out of the markup
                    f.write(s2)
                # Find the link to the next page
                next_url = soup.find_all('div', {'id': 'paginator'})
                pattern3 = r'href="(.*?)">后页'
                if len(next_url) == 0:
                    break
                next_url = re.findall(pattern3, str(next_url[0]))
                if len(next_url) == 0:  # no "next page" link: stop
                    break
                next_url = next_url[0]
                print('%d Crawling the next page of comments...' % begin)
                begin = begin + 1
                # Take a longer break after every six pages
                if begin % 6 == 0:
                    print('Resting...')
                    time.sleep(30)
                print(next_url)
                if comment_url + next_url not in seen:
                    seen.add(comment_url + next_url)
                    crawl_queue.append(comment_url + next_url)

    threads = []
    max_threads = 5
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            print('-------- next thread ----------')
            thread.setDaemon(True)  # daemon threads let the main thread exit on Ctrl+C
            thread.start()
            threads.append(thread)
        time.sleep(2)
    f.close()


if __name__ == '__main__':
    if isLogin():
        print('Already logged in')
    else:
        login('dsdz@qq.com', '5sdfsd6')
    file_name = 'key3.txt'
    get_comment(file_name)            # single-threaded crawler
    # thread_get_comment(file_name)   # multi-threaded crawler
    save_jieba_result(file_name)
    draw_wordcloud('pjl_jieba.txt')
```
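One caveat about `thread_get_comment`: the plain list `crawl_queue`, the `seen` set, and the shared file handle are all mutated from several threads with no synchronization, so comments can interleave and pages can be fetched twice. Below is a minimal, self-contained sketch (my own, not part of the original script) of the same idea built on the thread-safe `queue.Queue`, with locks around the shared state; the URL and markup follow the script above, and the User-Agent value is a placeholder.

```python
import queue
import re
import threading
import time

import requests
from bs4 import BeautifulSoup

BASE = 'https://movie.douban.com/subject/26363254/comments'
HEADERS = {'User-Agent': 'Mozilla/5.0'}  # placeholder desktop User-Agent


def crawl(start_query, filename, max_threads=5):
    urls = queue.Queue()             # thread-safe queue of pages to fetch
    urls.put(BASE + start_query)
    seen = {BASE + start_query}
    seen_lock = threading.Lock()     # guards the `seen` set
    write_lock = threading.Lock()    # serializes writes to the output file

    with open(filename, 'w', encoding='utf-8') as f:
        def worker():
            while True:
                try:
                    url = urls.get(timeout=10)   # exit once the queue stays empty
                except queue.Empty:
                    return
                time.sleep(5)                    # stay polite, as in the original
                soup = BeautifulSoup(requests.get(url, headers=HEADERS).text,
                                     'html.parser')
                texts = [p.get_text() for p in soup.select('div.comment p')]
                with write_lock:
                    f.write('\n'.join(texts))
                m = re.search(r'href="(.*?)">后页', str(soup))
                if m:
                    nxt = BASE + m.group(1).replace('&amp;', '&')
                    with seen_lock:
                        if nxt not in seen:      # enqueue each page only once
                            seen.add(nxt)
                            urls.put(nxt)
                urls.task_done()

        workers = [threading.Thread(target=worker, daemon=True)
                   for _ in range(max_threads)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
```

Note the sketch focuses on the synchronization pattern; it uses a bare `requests.get` rather than the logged-in `session` from the script above, so it sees only the pages Douban serves to anonymous visitors.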
The crawled comments end up in the text file key3.txt.
The first step is to install the necessary Python libraries, among them scipy and wordcloud for generating the word cloud. For how to install Python libraries, see my earlier blog posts. Once everything is ready, jieba can segment all of the collected comments, and the word cloud is drawn from the segmentation result.
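As a quick illustration of what the segmentation step produces (the sentence below is a made-up example, not taken from the crawled comments):

```python
import jieba

sample = u'这部电影真的很好看'          # a made-up movie comment
print(' '.join(jieba.cut(sample)))    # prints something like: 这部 电影 真的 很 好看
```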
The main code for this step is in mywordCloud.py:
```python
from scipy.misc import imread
import codecs
from os import path

import jieba
from wordcloud import WordCloud


# Count keyword frequencies (currently unused)
def get_all_keywords(file_name):
    word_lists = []  # list of keywords
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        Lists = f.readlines()
        for li in Lists:
            cut_list = list(jieba.cut(li))
            for word in cut_list:
                word_lists.append(word)
    word_lists_set = set(word_lists)  # drop duplicate words
    sort_count = []
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print(u'%d keywords in total' % length)
    k = 1
    for w in word_lists_set:
        sort_count.append(w + u':' + str(word_lists.count(w)) + u' times\n')
        print(u'%d---' % k + w + u':' + str(word_lists.count(w)) + u' times')
        k += 1
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)


def save_jieba_result(file_name):
    # Parallel segmentation (not available on Windows)
    # jieba.enable_parallel(4)
    dirs = path.join(path.dirname(__file__), file_name)
    print(dirs)
    with codecs.open(dirs, encoding='utf-8') as f:
        comment_text = f.read()
    cut_text = ' '.join(jieba.cut(comment_text))
    with codecs.open('pjl_jieba.txt', 'w', encoding='utf-8') as f:
        f.write(cut_text)


def draw_wordcloud(file_name):
    with codecs.open(file_name, encoding='utf-8') as f:
        comment_text = f.read()
    color_mask = imread('timg.jpg')  # background image used as the cloud's mask
    # Markup fragments plus common Chinese filler words to exclude from the cloud
    stopwords = ['png', 'douban', 'com', 'href', 'https', 'img', 'img3', 'class', 'source', 'icon', 'shire',
                 u'有点', u'真的', u'觉得', u'还是', u'一个', u'就是', u'电影', u'你们', u'这么', u'不过',
                 u'但是', u'什么', u'没有', u'这个', u'那个', u'大家', u'比较', u'看到', u'真是',
                 u'除了', u'时候', u'已经', u'可以']
    font = r'C:\Windows\Fonts\simfang.ttf'  # a Chinese font is required to render the words
    cloud = WordCloud(font_path=font, background_color='white', max_words=20000,
                      max_font_size=200, min_font_size=10, mask=color_mask, stopwords=stopwords)
    word_cloud = cloud.generate(comment_text)  # generate the word cloud
    word_cloud.to_file('mycloud.jpg')
```
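One portability note: `from scipy.misc import imread` only works on older SciPy releases; the function was deprecated in SciPy 1.0 and removed in 1.2. A small fallback (assuming the separate `imageio` package is installed) keeps the script working on both:

```python
# Fallback for newer SciPy versions, where scipy.misc.imread no longer exists.
# imageio.imread also returns a numpy array, which WordCloud accepts as a mask.
try:
    from scipy.misc import imread    # SciPy < 1.2
except ImportError:
    from imageio import imread       # assumption: installed via `pip install imageio`
```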
With these two scripts you can generate a nice word cloud surfacing the keywords that dominate viewers' comments on 《战狼2》 (Wolf Warrior 2).
The full source code is available at my GitHub: