matplotlibでラジカツスターズ!のコーナー正解率を分析する
もうひとつのブログ記事でラジカツスターズ!のコーナー正解率を分析するネタをやるので、そのテック部分について書く。
基本はmatplotlibでだらだら書いてるだけ。
ソース
python 3.5
Jupyter Notebook上での実装
各放送回別の問題数、正解数と、累計正解率のplot
import pandas as pd import matplotlib.pyplot as plt import statsmodels.api as sm import numpy as np import datetime %pylab inline --no-import-all #csvファイルのロード df = pd.read_csv('xxx/radikatsu_stars_list.csv', index_col=0, parse_dates=[1]) df = df.fillna(0) radikatsu = df.loc[0:] # todo 正解数の抽出できなかったので加工 on_air = radikatsu.index # 放送回数 series_correct_num = radikatsu['正解数'].astype(int) series_question_num = radikatsu['出題数'].astype(int) # 正解数と出題数をそれぞれ累計 list_cumulative_correct = [] # 正解数 list_cumulative_question = [] # 出題数 list_cumulative_correct_rate = [] # 正解率 cumulative_correct = 0 cumulative_question = 0 cumulative_correct_rate = 0 for x, y in zip(series_correct_num, series_question_num): cumulative_correct = cumulative_correct + x cumulative_question = cumulative_question + y cumulative_correct_rate = (cumulative_correct / cumulative_question) * 100 list_cumulative_correct.append(cumulative_correct) list_cumulative_question.append(cumulative_question) list_cumulative_correct_rate.append(cumulative_correct_rate) # 累計数をDataFrameにする cumulative = pd.DataFrame( {'01.on_air': on_air, '02.question': list_cumulative_question, '03.correct': list_cumulative_correct, '04.rate': list_cumulative_correct_rate}) cumulative.to_csv("cumulative.csv") # 線形回帰 mod = sm.OLS(list_cumulative_correct, sm.add_constant(on_air)) res = mod.fit() a, b= res.params # 相関係数を取得 # 100問正解するまでに何回かかるか list_on_air_goal = [] num = 0 on_air_goal = 1 while num < 100: num = a + b * on_air_goal list_on_air_goal.append(on_air_goal) on_air_goal = on_air_goal + 1 # 正解数推移 plt.figure(figsize=(20, 10), dpi=100, linewidth = 100) plt.tick_params(labelsize=18) plt.plot(on_air, list_cumulative_correct, marker='o', color='lightblue', ms=15) # 放送回ごとの正解数 plt.plot(list_on_air_goal, a + b * np.array(list_on_air_goal), lw=8, alpha=0.5, ls='-', color='#C7243A') # 直線 plt.plot(max(list_on_air_goal), 100, marker='*', color='#B61972', ms=20) # 目標回 plt.yticks( np.arange(0, 120, 10) ) plt.xticks( np.arange(0, 110, 5) ) 
plt.ylabel('correct answers', fontsize=15) plt.xlabel('on air', fontsize=15) plt.savefig('list_on_air_goal.png') # グラフのダウンロード plt.show() # 各放送回別の問題数、正解数と、累計正解率 plt.figure(figsize=(20, 10), dpi=100, linewidth = 100) ax = plt.subplot() ax.bar(on_air, series_question_num, color='#44A5CB', align="center", label="Question num") ax.bar(on_air, series_correct_num, color='#EDAD0B', align="center", label="Correct num") ax.legend(loc=2) # 凡例 plt.xlabel('on air', fontsize=15) plt.ylabel("Question/Correct num", fontsize=15) plt.yticks( np.arange(0, max(series_question_num) + 3, 1) ) ax2 = ax.twinx() plt.plot(on_air, list_cumulative_correct_rate, marker='o', color='#C7243A') # 放送回までの正解率 plt.ylabel("Correct answer rate(Cumulative)", fontsize=15) plt.savefig('questionCorrectAll.png') # グラフのダウンロード plt.show()
radikatsu = df.loc[0:] # TODO: workaround — extracting the 正解数 (correct-answer count) column directly failed, so the frame is re-sliced first
DataFrameで正解数を取ろうとしたときに、エラーになってうまく取れず、しょうがなく一旦上記でかわしてからとるようにした。
どうやればうまくとれたのかちょっとよくわからん。
メンバー別の問題数、正解数と、累計正解率のplot
def _monthly_totals(member):
    """Sum questions and correct answers per year-month for one member.

    Walks the module-level ``radikatsu`` frame row by row.  Every run of
    consecutive rows sharing the same ``公開日`` year-month yields one entry;
    only rows where the ``member`` column equals 1 contribute to the sums,
    so months in which the member did not host show up with zeros.

    Returns three parallel lists: year-month labels ("YYYY-MM"), question
    counts and correct-answer counts.
    """
    list_date = []      # year-month labels
    list_question = []  # questions asked in that month
    list_correct = []   # correct answers in that month
    current_month = None
    question_num = 0
    answer_num = 0

    for _, series_row in radikatsu.iterrows():
        target = series_row['公開日'].strftime("%Y-%m")
        if target != current_month:
            # A new month starts: flush the finished one (if any) and reset.
            if current_month is not None:
                list_date.append(current_month)
                list_question.append(question_num)
                list_correct.append(answer_num)
            current_month = target
            question_num = 0
            answer_num = 0
        # Count the row only when this member hosted the episode.  Doing
        # this after the reset also covers the very first row, which would
        # otherwise never be added to any month.
        if series_row[member] == 1:
            question_num = question_num + series_row['出題数']
            answer_num = answer_num + series_row['正解数']

    # Flush the last month; nothing to flush when the frame is empty.
    if current_month is not None:
        list_date.append(current_month)
        list_question.append(question_num)
        list_correct.append(answer_num)
    return list_date, list_question, list_correct


def member_answer(member):
    """Chart one member's monthly quiz results and return the totals.

    Writes ``member_correct<member>.csv`` and ``<member>.png``, shows the
    bar chart, and returns ``(questions, correct answers, correct rate in
    percent)`` summed over all months.  The rate is NaN when the member
    had no questions at all.

    ``member`` is the name of a column in the module-level ``radikatsu``
    frame that holds 1 on the rows for episodes the member hosted.
    """
    list_date, list_question, list_correct = _monthly_totals(member)

    # Per-month questions / correct answers.
    member_correct = pd.DataFrame(
        {'01.year': list_date,
         '02.question': list_question,
         '03.correct': list_correct})
    member_correct.to_csv("member_correct" + member + ".csv")

    # Overall totals and correct rate.
    question_crr = sum(list_question)
    answer_crr = sum(list_correct)
    if question_crr:
        crr_ans_rate = (answer_crr / question_crr) * 100
    else:
        # No questions at all: the rate is undefined, avoid dividing by zero.
        crr_ans_rate = float("nan")

    # The notebook is not set up for Japanese text, so the chart title uses
    # a romanised name.  Any member not listed gets the fallback title.
    titles = {
        "るか": "Ruka's Question/Correct num",
        "みき": "Miki's Question/Correct num",
        "かな": "Kana's Question/Correct num",
        "みほ": "Miho's Question/Correct num",
        "ななせ": "Nanase's Question/Correct num",
        "せな": "Sena's Question/Correct num",
    }
    title = titles.get(member, "Rie's Question/Answer num")

    # 1-based positions for the x axis, one per month.
    serial_no = list(range(1, len(list_date) + 1))

    # Plot
    plt.figure(figsize=(20, 10), dpi=100, linewidth=100)
    ax = plt.subplot()
    ax.bar(serial_no, list_question,
           color='#44A5CB', align="center", label="Question num")  # questions
    ax.bar(serial_no, list_correct,
           color='#EDAD0B', align="center", label="Correct num")  # correct
    plt.ylabel("Question/Correct num", fontsize=15)
    ax.legend(loc=2)  # legend in the upper-left corner
    plt.title(title, fontsize=15)
    plt.yticks(np.arange(0, 20, 1))
    plt.xticks(serial_no, list_date, rotation=90)
    plt.savefig(member + '.png')  # save the chart as an image file
    plt.show()

    return question_crr, answer_crr, crr_ans_rate
メンバー7人で同じ処理を使うので関数化。
# (column name in the data, ASCII label used on the chart's x axis)
MEMBERS = [
    ('るか', 'ruka'),
    ('みき', 'miki'),
    ('かな', 'kana'),
    ('みほ', 'miho'),
    ('ななせ', 'nanase'),
    ('せな', 'sena'),
    ('りえ', 'rie'),
]

# Run the per-member analysis once for each member and collect the totals.
list_question_cum = []   # total questions per member
list_answer_cum = []     # total correct answers per member
list_crr_ans_rate = []   # correct rate (%) per member
for member_name, _ in MEMBERS:
    question_total, answer_total, rate = member_answer(member_name)
    list_question_cum.append(question_total)
    list_answer_cum.append(answer_total)
    list_crr_ans_rate.append(rate)

# Questions / correct answers / correct rate per member.
crr_ans_rate = pd.DataFrame(
    {'01.member': [member_name for member_name, _ in MEMBERS],
     '02.question': list_question_cum,
     '03.answer': list_answer_cum,
     '04.correct_rate': list_crr_ans_rate})
crr_ans_rate.to_csv("crr_ans_rate.csv")

# 1-based bar positions on the x axis, one per member.
appr_member = list(range(1, len(MEMBERS) + 1))

# Plot: bars for questions / correct answers, line for the correct rate.
plt.figure(figsize=(20, 10), dpi=100, linewidth=100)
ax = plt.subplot()
ax.bar(appr_member, list_question_cum,
       color='#44A5CB', align="center", label="Question num")  # questions
ax.bar(appr_member, list_answer_cum,
       color='#EDAD0B', align="center", label="Correct num")  # correct
ax.legend(loc=2)  # legend in the upper-left corner
plt.yticks(np.arange(0, max(list_question_cum) + 3, 10))
plt.xticks(appr_member, [label for _, label in MEMBERS],
           rotation=90, fontsize=15)
plt.ylabel("Question num", fontsize=15)
ax2 = ax.twinx()
ax2.plot(appr_member, list_crr_ans_rate,
         linewidth=5, marker='o', markersize=10, color='#C7243A')  # rate
plt.ylabel("Correct answer rate", fontsize=15)
plt.savefig('Correct answer rate.png')  # save the chart as an image file
plt.show()