爬取虎扑 NBA 新闻
日期:2024-07-01 13:33 | 人气:
import re

import jieba.analyse
import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
from PIL import Image, ImageSequence
from wordcloud import WordCloud, ImageColorGenerator

# First listing page of the Hupu NBA news section.
url = 'https://voice.hupu.com/nba/1'

# Listing pages after the first one are addressed by page number.
PAGE_URL_TEMPLATE = 'https://voice.hupu.com/nba/%s'

# Seconds to wait for a single HTTP response before giving up on that page.
REQUEST_TIMEOUT = 10


def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed, or ``None`` if the request fails."""
    try:
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('request failed, skipping %s (%s)' % (page_url, exc))
        return None
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def _save_articles(listing, out_file, count):
    """Append every article linked from one listing page to ``out_file``.

    An article is any ``<li>`` that contains an ``<h4>`` with a link. Entries
    whose page cannot be fetched, or that lack the article body element, are
    skipped instead of aborting the whole crawl.

    Returns the running article count after this page.
    """
    for news in listing.select('li'):
        heading = news.find('h4')
        if heading is None or heading.a is None or not heading.a.get('href'):
            continue

        article = _fetch_soup(heading.a['href'])
        if article is None:
            continue

        # Selector (including the "artical" spelling) is kept exactly as the
        # site's markup is addressed by the original script.
        body = article.select('div .artical-main-content')
        if not body:
            print('no article body found, skipping %s' % heading.a['href'])
            continue

        title = heading.text
        context = body[0].text
        out_file.write(title)
        out_file.write(context)

        count += 1
        print(count)
        print("文章标题:" + title)
        print(context)
    return count


def AlltitleAndUrl(url, out_path='dongman.txt', last_page=200):
    """Scrape Hupu NBA news titles and article bodies into a text file.

    The listing at ``url`` is processed first, then listing pages 2 through
    ``last_page`` (inclusive). Title and body text of every article are
    appended to ``out_path`` (UTF-8).

    Args:
        url: URL of the first listing page.
        out_path: File that article text is appended to.
        last_page: Number of the last listing page to crawl.
    """
    count = 0
    with open(out_path, 'a', encoding='utf-8') as out_file:
        first_page = _fetch_soup(url)
        if first_page is not None:
            count = _save_articles(first_page, out_file, count)

        for page in range(2, last_page + 1):
            listing = _fetch_soup(PAGE_URL_TEMPLATE % page)
            if listing is None:
                continue
            # Iterate the page that was just downloaded; the original looped
            # over the first page's soup again here and never read pages 2+.
            count = _save_articles(listing, out_file, count)


def getWord(text_path='3.txt', mask_path='body.png',
            font_path='fonts/simhei.ttf', out_path='dream.png'):
    """Build, display and save a word cloud from a text file.

    Keywords are extracted with jieba's TextRank (top 2000, weighted), drawn
    inside the shape of the mask image, recoloured from that image, shown
    with matplotlib and written to ``out_path``.

    Args:
        text_path: UTF-8 text file to analyse.
        mask_path: Image used both as the cloud mask and as colour source.
        font_path: Local font file used to render the words. The original
            value was a blog URL, which cannot be opened as a font file.
        out_path: Where the rendered word cloud image is saved.
    """
    # Read the whole file in one go. Mixing line iteration with read(), as
    # the original did, silently dropped the first line of the text.
    with open(text_path, 'r', encoding='utf-8') as text_file:
        lyric = text_file.read()

    result = jieba.analyse.textrank(lyric, topK=2000, withWeight=True)
    keywords = {word: weight for word, weight in result}
    print(keywords)
    if not keywords:
        # generate_from_frequencies cannot work with an empty mapping.
        print('no keywords extracted from %s' % text_path)
        return

    with Image.open(mask_path) as image:
        graph = np.array(image)

    wc = WordCloud(font_path=font_path, background_color='White',
                   max_words=230, mask=graph, random_state=30, scale=1.5)
    wc.generate_from_frequencies(keywords)

    # recolor() updates the cloud in place, so the saved file uses the
    # mask-derived colours as well.
    image_color = ImageColorGenerator(graph)
    wc.recolor(color_func=image_color)

    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    wc.to_file(out_path)


# Call order is kept from the original script: the word cloud is built from
# an existing '3.txt' before the crawl starts.
# NOTE(review): the crawl writes 'dongman.txt', not '3.txt' - confirm which
# file getWord() is meant to read.
getWord()
AlltitleAndUrl(url)