from bs4 import BeautifulSoup
import urllib.request
from urllib import parse
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from os import path
import re
import numpy as np
from PIL import Image
def jnews_link_scroll(kw, page=1):
    """Collect article links and titles from JoongAng news search results.

    Requests search-result pages 1 through ``page`` for the keyword and
    gathers every ``<a>`` found inside an ``<h2 class="headline mg">``
    element.

    Args:
        kw: Search keyword; it is percent-encoded before being placed in
            the query string.
        page: Number of result pages to fetch, starting from page 1.

    Returns:
        A list of ``[href, title]`` pairs in the order they were found.
        The list is empty when ``page`` is less than 1.
    """
    params = []  # [link, title]
    encoded_kw = parse.quote(kw)  # percent-encode the keyword for the URL

    for page_no in range(1, page + 1):
        # Only the page number and keyword vary between requests.
        list_url = f"https://news.joins.com/Search/JoongangNews?page={page_no}&Keyword={encoded_kw}&SortType=New&SearchCategoryType=JoongangNews"
        request = urllib.request.Request(list_url)
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(request) as response:
            result_html = response.read().decode("utf-8")
        soup = BeautifulSoup(result_html, "html.parser")

        for headline in soup.find_all("h2", class_='headline mg'):
            # Select anchors explicitly: iterating the raw children of the
            # <h2> also yields whitespace text nodes, which have no .get().
            for anchor in headline.find_all("a"):
                # get_text()'s first argument is a separator, not an
                # attribute name, so it is called without arguments here.
                params.append([anchor.get("href"), anchor.get_text()])
    return params
def jnews_detail_scroll(kw, page=1, path='d:\\data'):
    """Download article bodies for a keyword and save them to a text file.

    The link/title list comes from ``jnews_link_scroll(kw, page)``. Each
    linked page is fetched, and the text of every ``<div id="article_body">``
    is written to ``<path>\\jnews_<kw><page>.txt`` preceded by a title line.

    Args:
        kw: Search keyword; also used in the output file name.
        page: Number of search-result pages to crawl; also used in the
            output file name.
        path: Directory in which the output file is created.

    Returns:
        None. The result is the file written to disk.
    """
    # 1. [link, title] pairs gathered from the search results.
    articles = jnews_link_scroll(kw, page)

    # 2. Save the article bodies. The context manager guarantees the file is
    #    closed even if one of the downloads below raises.
    with open(path+f'\\jnews_{kw}{page}.txt', 'w', encoding='utf-8') as f:
        for link, title in articles:
            if not link:
                # An entry without an href cannot be fetched; skip it.
                continue
            request = urllib.request.Request(link)
            with urllib.request.urlopen(request) as response:
                result_html = response.read().decode("utf-8")
            soup = BeautifulSoup(result_html, "html.parser")

            # Extract the text of each article-body container.
            for body in soup.find_all("div", id="article_body"):
                body_text = str(body.get_text(" ", strip=True))
                f.write(f'기사제목: {title}\n' + body_text + '\n' + '\n')
def draw_wordcloud(kw, shape='korea', page=1):
    """Crawl articles for a keyword and render them as a masked word cloud.

    Steps:
      1. Load the mask image ``c:/project/<shape>_im.png``.
      2. Call ``jnews_detail_scroll`` to write the article text file, then
         read that file back.
      3. Remove every word listed in ``d:/project/word.txt`` from the text.
      4. Generate the word cloud, save it to ``d:/project/cnn_cloud.png``
         and display it with matplotlib.

    Args:
        kw: Search keyword.
        shape: Prefix of the mask image file name.
        page: Number of search-result pages to crawl.

    Returns:
        None.
    """
    # Load the mask inside a context manager so the image file is released.
    with Image.open(f"c:/project/{shape}_im.png") as mask_image:
        mask1 = np.array(mask_image)

    # Save the article bodies to disk, then read them back.
    jnews_detail_scroll(kw, page, path='d:\\data')
    with open(f'd:\\data\\jnews_{kw}{page}.txt', mode="r", encoding="utf-8") as text:
        text2 = text.read()

    # word: words to exclude. split() with no argument splits on any
    # whitespace, so newlines and repeated spaces produce no empty or
    # glued-together tokens.
    with open('d:/project/word.txt', 'r', encoding='utf-8') as file:
        word = file.read().split()
    for excluded in word:
        # Literal replacement: the entries are plain words, not regex
        # patterns, so metacharacters in them must not be interpreted.
        text2 = text2.replace(excluded, '')

    # Create and save the word cloud.
    wordcloud = WordCloud(font_path='C://Windows//Fonts//gulim',
                          stopwords=STOPWORDS,
                          max_words=1000,
                          background_color='white',
                          max_font_size=100,
                          min_font_size=1,
                          mask=mask1,
                          colormap='jet').generate(text2)
    wordcloud.to_file('d:/project/cnn_cloud.png')

    # Display the word cloud.
    plt.figure(figsize=(15, 15))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()