爬取虎扑 NBA 新闻

日期:2024-07-01 13:33 | 人气:

import requests
import re
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image,ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator
# First listing page of Hupu's NBA news section; passed to AlltitleAndUrl()
# by the script-level call at the bottom of the file.
url ='https://voice.hupu.com/nba/1'
# Collect the title and body text of Hupu NBA news articles, walking the
# paginated listing (the original author's estimate was ~12,000 articles).
def _fetch_soup(page_url):
    """Download *page_url* and return it parsed as a BeautifulSoup tree."""
    response = requests.get(page_url)
    # The pages are decoded explicitly as UTF-8 rather than trusting the
    # encoding guessed from the response headers.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def _save_articles(listing_soup, count):
    """Append every article linked from one listing page to dongman.txt.

    An article entry is any ``<li>`` that contains an ``<h4>`` whose ``<a>``
    carries the article link. For each one the article page is fetched and
    its title and body text are appended to ``dongman.txt`` and echoed to
    stdout, followed by the running article number.

    Args:
        listing_soup: parsed listing page.
        count: number of articles saved before this page.

    Returns:
        The updated number of saved articles.
    """
    for item in listing_soup.select('li'):
        heading = item.find('h4')
        if heading is None:
            # Not an article entry (navigation item, etc.).
            continue
        link = heading.a
        if link is None or not link.get('href'):
            continue

        title = heading.text
        article_soup = _fetch_soup(link['href'])
        body_nodes = article_soup.select('div .artical-main-content')
        if not body_nodes:
            # Article page without the expected body container: skip it
            # instead of aborting the whole crawl with an IndexError.
            continue
        context = body_nodes[0].text

        # Append per article so that progress survives an interrupted crawl.
        with open('dongman.txt', 'a', encoding='utf-8') as out:
            out.write(title)
            out.write(context)

        count += 1
        print("文章标题:" + title)
        print(context)
        print(count)
    return count


def AlltitleAndUrl(url, last_page=200):
    """Scrape Hupu NBA news articles into ``dongman.txt``.

    The first listing page is taken from *url*; listing pages 2 through
    *last_page* are fetched from ``https://voice.hupu.com/nba/<page>``.

    Args:
        url: address of the first listing page.
        last_page: number of the last listing page to crawl (inclusive).
            Defaults to 200, the value that was previously hard-coded.

    Raises:
        requests.RequestException: if any page cannot be downloaded.
    """
    count = _save_articles(_fetch_soup(url), 0)

    # Remaining listing pages. Each page's own parsed tree is scanned here;
    # the earlier code re-scanned the first page's tree on every iteration,
    # so pages 2+ were downloaded but never actually read.
    for page in range(2, last_page + 1):
        page_url = 'https://voice.hupu.com/nba/%s' % page
        count = _save_articles(_fetch_soup(page_url), count)


def getWord():
    """Build a word cloud from the text in ``3.txt`` and save it as ``dream.png``.

    Steps:
      1. Read the whole corpus from ``3.txt`` (UTF-8).
      2. Extract up to 2000 weighted keywords with jieba's TextRank.
      3. Render the keywords inside the shape of ``body.png``, coloured
         from that same image, show the result with matplotlib and write
         it to ``dream.png``.

    The extracted keyword/weight mapping is also printed to stdout.

    Raises:
        FileNotFoundError: if ``3.txt`` or ``body.png`` is missing.
    """
    # Read the file in a single call. Iterating the file object and calling
    # read() inside the loop (the earlier approach) consumed and discarded
    # the first line, and never closed the handle.
    with open('3.txt', 'r', encoding='utf-8') as corpus:
        lyric = corpus.read()

    # TextRank keyword extraction; withWeight=True yields (word, weight) pairs.
    result = jieba.analyse.textrank(lyric, topK=2000, withWeight=True)
    keywords = {word: weight for word, weight in result}
    print(keywords)

    # Template image: used both as the mask and as the colour source.
    with Image.open('body.png') as image:
        graph = np.array(image)

    # Word cloud configuration.
    # font_path must point at a local font file that contains CJK glyphs.
    # The previous value was an https://blog.csdn.net/... URL, which cannot
    # be opened as a font; it appears to be the relative path
    # 'fonts/simhei.ttf' mangled by the hosting page, so that path is
    # restored here.
    # NOTE(review): confirm the font really lives at fonts/simhei.ttf.
    wc = WordCloud(font_path='fonts/simhei.ttf', background_color='White',
                   max_words=230, mask=graph, random_state=30, scale=1.5)
    wc.generate_from_frequencies(keywords)

    # Recolour the words from the template image, then display and save.
    # (A separate imshow of the un-recoloured cloud was dropped: it was
    # immediately covered by this one.)
    image_color = ImageColorGenerator(graph)
    plt.imshow(wc.recolor(color_func=image_color))
    plt.axis("off")
    plt.show()
    wc.to_file('dream.png')


# Script entry: render the word cloud first, then crawl Hupu starting at `url`.
# NOTE(review): getWord() reads '3.txt' while AlltitleAndUrl() appends to
# 'dongman.txt', so the text crawled in this run is not what the word cloud
# is built from — confirm this ordering/file naming is intended.
getWord()
AlltitleAndUrl(url)

旋转小火锅定制流程

免费咨询

提供图纸

免费设计

免费报价

无忧安装

终身维护

平台注册入口