Natural Language Processing: How to Collect Training Data with Web Scraping
Better late than never, I decided I wanted to build a bot, but I struggled with gathering the training data needed to build a model with machine learning. So this post focuses only on collecting training data and shares the approach I actually used. The language is Python.
The approach I took this time extracts training data from the character dialogues posted on the site "http://ssmania.info/category/".
The overall flow is as follows (the file flow is summarized right below):
1. Collect links: gather the URLs listed on "http://ssmania.info/category/" into a single file.
2. Collect dialogues from the links: save the dialogues found at the collected URLs into two files, "input.txt" and "output.txt".
3. Split the dialogues into words: use MeCab to split the text in "input.txt" and "output.txt" into words.
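For reference, here is what each step reads and writes (only CollectLinks.py is named explicitly in this article; the other two scripts are the ones shown in sections 2 and 3):

Step 1 (CollectLinks.py): category pages on ssmania.info -> charaLink.txt
Step 2 (dialogue scraper): charaLink.txt -> input.txt, output.txt
Step 3 (MeCab tokenizer): input.txt, output.txt -> input_result.txt, output_result.txt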
Prerequisites
OS: Windows 10
Language: Python 3.6.4
Table of Contents
1. Collect links
2. Collect dialogues from the links
3. Split the dialogues into words
4. Results
1. Collect links
On "http://ssmania.info/category/", select the anime whose dialogues you want to collect.
Example 1) Select the anime you want to get data for, as shown below.
Example 2) Once you have moved to that page, copy its URL and paste it in place of the ※※※ in the [self.base_url = '※※※'] assignment in CollectLinks.py.
#!/Users/igaki/.pyenv/shims/python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup


class CharaScreipingu(object):
    def __init__(self):
        # Links that start with the top-page URL are site navigation, not articles.
        self.exclusion_url = "http://ssmania.info/"
        # Paste the URL of the category page you want to collect here.
        self.base_url = "http://ssmania.info/category/765%E3%83%97%E3%83%AD"

    def _boardLinkSearch(self):
        # Walk the category pages one by one and append every article link
        # to ./charaLink.txt until a page says there are no more articles.
        self.board_list_url = self.base_url + "?page={}"
        page = 1
        while True:
            link = list()
            print(page)
            board_top_html = requests.get(self.board_list_url.format(page))
            board_top_soup = BeautifulSoup(board_top_html.content, "html.parser")
            main_contents = board_top_soup.find(id="contents")
            # The site shows this message on the page after the last one.
            if "該当記事はありませんでした" == board_top_soup.find(class_="alink").text:
                print("finish")
                break
            for a in main_contents.find_all('a'):
                tmp_link = a.get('href')
                if self.exclusion_url not in tmp_link:
                    link.append(tmp_link)
            link_list = "\n" + "\n".join(link)
            with open("./charaLink.txt", "a") as f:
                f.write(link_list)
            page += 1


def main():
    scrpeing = CharaScreipingu()
    scrpeing._boardLinkSearch()


if __name__ == "__main__":
    main()
    print("search Finish")
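Note that the script opens charaLink.txt in append mode, so running it more than once (or for several categories) can leave duplicate URLs in the file. If that happens, a minimal clean-up sketch like the following should work; the file name matches the script above, everything else is just an illustration:

# -*- coding: utf-8 -*-
# Remove duplicate URLs from charaLink.txt while keeping their original order.
# Optional clean-up step before running the step-2 scraper.
seen = set()
unique_links = []
with open("./charaLink.txt", "r") as f:
    for line in f:
        link = line.strip()
        if link and link not in seen:
            seen.add(link)
            unique_links.append(link)

with open("./charaLink.txt", "w") as f:
    f.write("\n".join(unique_links))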
2. Collect dialogues from the links
Collect the dialogues between characters from the URLs saved in the file created in step 1.
#!/Users/igaki/.pyenv/shims/python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup


class WordScreipingu(object):
    def __init__(self):
        # Map each blog domain found in charaLink.txt to the CSS class of the
        # element that holds the article body. Domains mapped to None are
        # skipped because their page layout is not handled.
        self.urlList = {
            "blog.livedoor.jp": "article-body-inner",
            "ssbiyori.blog.fc2.com": None,
            "yomicom.jp": "ently_text",
            "potittoss.blog.jp": "article-body-inner",
            "ss-m.net": None,
            "s2-log.com": "article-body-inner",
            "ss-navi.com": None,
            "horahorazoon.blog134.fc2.com": None,
            "ayame2nd.blog.jp": "article-body-inner",
            "ssimas72.blog.jp": "article-body-inner",
            "elephant.2chblog.jp": "article",
            "morikinoko.com": "article-body-inner",
            "amnesiataizen.blog.fc2.com": None,
            "ssblog614.blog.fc2.com": None,
            "sssokuhou.com": "article-body-inner",
            "invariant0.blog130.fc2.com": None,
            "darusoku.xyz": None,
            "ss-station.2chblog.jp": "article-body-inner",
            "minnanohimatubushi.2chblog.jp": "article-body-inner",
            "ssmansion.xyz": None,
            "142ch.blog90.fc2.com": None,
            "tangaron3.sakura.ne.jp": None,
            "ssmaster.blog.jp": "article-body-inner",
            "dousoku.net": "article-body-inner",
            "ssflash.net": "article-body-inner",
            "lclc.blog.jp": "article-body-inner",
            "www.lovelive-ss.com": None,
            "maoyuss.blog.fc2.com": None,
            "ssspecial578.blog135.fc2.com": None,
        }

    def _fileRead(self):
        # Dispatch every link collected in step 1 to the scraper, using the
        # article-body class registered for its domain.
        with open("./charaLink.txt", "r") as f:
            for link in f.readlines():
                link = link.strip()
                if not link:
                    continue
                body_class = self.urlList.get(link.split("/")[2])
                if body_class is not None:
                    self._scrapeArticle(link, body_class)

    def _scrapeArticle(self, url, body_class):
        # Fetch the article page; pages with an element of id="character" are
        # skipped, otherwise the article body text is handed to the extractor.
        html = requests.get(url)
        soup = BeautifulSoup(html.content, "html.parser")
        if soup.find(id="character") is None:
            try:
                self._takeCharaWord(soup.find(class_=body_class).text)
            except AttributeError:
                pass

    def _takeCharaWord(self, word):
        # Write every utterance wrapped in 「」 or 『』 alternately to input.txt
        # and output.txt, so consecutive utterances form (input, output) pairs.
        char_num = len(word)
        flag = 0
        with open("./input.txt", "a") as inputfile, open("./output.txt", "a") as outputfile:
            num = 0
            while num < char_num:
                if word[num] in ("「", "『"):
                    close = "」" if word[num] == "「" else "』"
                    num += 1
                    target = inputfile if flag == 0 else outputfile
                    while num < char_num and word[num] != close:
                        target.write(word[num])
                        num += 1
                    target.write("\n")
                    flag = 1 - flag
                num += 1


def temp():
    # Helper that prints the distinct blog domains appearing in charaLink.txt.
    link_type = list()
    with open("./charaLink.txt", "r") as f:
        for link in f:
            link = link.strip()
            if not link:
                continue
            domain = link.split("/")[2]
            if domain not in link_type:
                link_type.append(domain)
    print(len(link_type))
    for domain in link_type:
        print(domain)


def main():
    scrpeing = WordScreipingu()
    scrpeing._fileRead()


if __name__ == "__main__":
    main()
    print("search Finish")
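To make the pairing rule easier to see, here is a small self-contained sketch of the same idea used in _takeCharaWord: utterances wrapped in 「」 are taken in order and alternated, so consecutive utterances become (input, output) pairs. The sample string is made up; the real script works on the article body of each blog post.

# -*- coding: utf-8 -*-
# Illustration only: extract 「」-quoted utterances and pair them up in order,
# the way the scraper alternates lines between input.txt and output.txt.
import re

sample = "春香「おはようございます!」P「おはよう、今日も頑張ろう」"  # made-up example text

utterances = re.findall(r"「(.*?)」", sample)
pairs = list(zip(utterances[0::2], utterances[1::2]))
for inp, out in pairs:
    print("input :", inp)
    print("output:", out)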
3. Split the dialogues into words
To train with seq2seq, the sentences need to be split into words, so this time I used MeCab, an open-source morphological analyzer. The script reads the collected data from "input.txt" and "output.txt" and writes the tokenized results to "input_result.txt" and "output_result.txt" respectively.
# -*- coding: utf-8 -*-
import MeCab

mode = MeCab.Tagger("-Ochasen")

# Tokenize every line of input.txt and write the space-separated surface forms
# to input_result.txt, one sentence per line.
num = 0
with open("./input.txt", "r", encoding="utf-8-sig") as f:
    for line in f.readlines():
        num += 1
        line_result = mode.parse(line).split("\n")
        with open("./input_result.txt", "a", encoding="utf-8") as result_file:
            for word in line_result:
                if word.split("\t")[0] != "EOS":
                    result_file.write(word.split("\t")[0])
                    result_file.write(" ")
            result_file.write("\n")

# Do the same for output.txt -> output_result.txt.
num = 0
with open("./output.txt", "r", encoding="utf-8-sig") as f:
    for line in f.readlines():
        num += 1
        line_result = mode.parse(line).split("\n")
        with open("./output_result.txt", "a", encoding="utf-8") as result_file:
            for word in line_result:
                if word.split("\t")[0] != "EOS":
                    result_file.write(word.split("\t")[0])
                    result_file.write(" ")
            result_file.write("\n")
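Before training, it can be worth checking that the two tokenized files still line up one-to-one. A minimal sketch, assuming the file names produced above:

# -*- coding: utf-8 -*-
# Read the tokenized files produced above and pair them line by line,
# which is the (input, output) shape a typical seq2seq training loop expects.
with open("./input_result.txt", "r", encoding="utf-8") as fin, \
     open("./output_result.txt", "r", encoding="utf-8") as fout:
    pairs = list(zip(fin.readlines(), fout.readlines()))

# Print the first few pairs to eyeball the alignment.
for src, tgt in pairs[:5]:
    print("input :", src.strip())
    print("output:", tgt.strip())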
4. Results
In the end, I was able to collect data like the following.
... うーん 、 ブラック の 缶 珈琲 って なんとなく 味気 ない 気 が し て で さぁ 春香 今日 な んで 事務所 来 た の ? いや 、 今日 春香 オフ じゃん … それ 、 先週 別 の 日 に なっ たって 言っ た と 思う ん だ けど … ...