# Source code (original heading: ソースコード)
import time

import matplotlib.cm as cm
import matplotlib.font_manager
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyquery
import requests
import scipy.stats
import seaborn as sns
# Build a handle -> rating lookup for every rated Codeforces user, using the
# public `user.ratedList` API method.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() turns an HTTP error response into an immediate,
# readable failure instead of a JSON-decoding or KeyError further down.
cf_response = requests.get('http://codeforces.com/api/user.ratedList', timeout=60)
cf_response.raise_for_status()
cf_data = cf_response.json()
cf_rating = {user['handle']: user['rating'] for user in cf_data['result']}
# Scrape the AtCoder ranking pages. For every listed user, record the AtCoder
# rating, the value of the fifth table cell (stored as the contest count) and
# the Codeforces rating of the identically named Codeforces handle, if any.
# The three lists are kept parallel: index k in each refers to the same user.
rating_atcoder = []
counts = []
rating_codeforces = []
# Pages 1..28 are read; the upper bound is hard-coded in the original script.
# `range` and the function-call form of print behave the same on Python 2 and
# Python 3, unlike the original `xrange` / print statement.
for page in range(1, 29):
    table = pyquery.PyQuery(url='https://atcoder.jp/ranking?p={}'.format(page))
    # The first <tr> is skipped -- presumably the table's header row.
    for row in table.find('tr')[1:]:
        cells = pyquery.PyQuery(row).find('td')
        rank = int(pyquery.PyQuery(cells[0]).text())
        name = pyquery.PyQuery(cells[1]).text()
        rating = int(pyquery.PyQuery(cells[2]).text())
        count = int(pyquery.PyQuery(cells[4]).text())
        # Users without a same-named Codeforces account get NaN so they can
        # be dropped later with dropna().
        cf = cf_rating.get(name, np.nan)
        print(u'{} {} {} {} {}'.format(rank, name, rating, cf, count))
        rating_atcoder.append(rating)
        counts.append(count)
        rating_codeforces.append(cf)
    # Pause one second between page fetches.
    # NOTE(review): the original indentation was lost; the pause is placed
    # once per page rather than once per table row -- confirm.
    time.sleep(1)
# One row per scraped AtCoder user; each of the three parallel lists becomes
# a column of the frame.
df = pd.DataFrame(dict(
    rating_atcoder=rating_atcoder,
    rating_codeforces=rating_codeforces,
    count=counts,
))
# Histogram of AtCoder ratings in 100-point bins. Each 400-point rating band
# is drawn as its own filled histogram so that it gets its own colour.
#
# pyplot is called directly: `sns.plt` was only an incidental alias for
# matplotlib.pyplot and is not available in current seaborn releases.

# The labels are Japanese, so a font file is loaded explicitly. The path is
# Windows-specific. `prop` is reused by the later plots in this script.
prop = matplotlib.font_manager.FontProperties(fname=r'C:\Windows\Fonts\meiryo.ttc', size=12)

rating_bins = list(range(0, 4001, 100))
# (inclusive lower bound, exclusive upper bound, fill colour). The infinite
# bounds make the first and last bands open-ended, as in the original
# one-sided comparisons. Colours are presumably AtCoder's rank colours.
rating_bands = [
    (float('-inf'), 400, '#808080'),
    (400, 800, '#804000'),
    (800, 1200, '#008000'),
    (1200, 1600, '#00C0C0'),
    (1600, 2000, '#0000FF'),
    (2000, 2400, '#C0C000'),
    (2400, 2800, '#FF8000'),
    (2800, float('inf'), '#FF0000'),
]
atcoder_ratings = df['rating_atcoder']
for band_low, band_high, band_colour in rating_bands:
    in_band = (band_low <= atcoder_ratings) & (atcoder_ratings < band_high)
    plt.hist(atcoder_ratings[in_band], bins=rating_bins, histtype='stepfilled', color=band_colour)
# Title: "Distribution of AtCoder ratings"; x: "AtCoder rating"; y: "Number of users".
plt.title(u'AtCoderのレーティングの分布', fontproperties=prop)
plt.xlabel(u'AtCoderのレーティング', fontproperties=prop)
plt.ylabel(u'ユーザー数', fontproperties=prop)
plt.show()
# For each rating-band boundary, print 100 minus the percentile rank of that
# rating among all scraped users, i.e. roughly the share of users at or above
# the boundary.
print('rating: percentile')
for threshold in [0, 400, 800, 1200, 1600, 2000, 2400, 2800]:
    share_above = 100 - scipy.stats.percentileofscore(df['rating_atcoder'], threshold)
    # '.3g' keeps three significant digits. The original bare '.3' switches
    # to scientific notation for 100.0 and printed it as '1e+02'.
    print('{}: {:.3g}'.format(threshold, share_above))
# Keep only users for whom a Codeforces rating was found (rows containing NaN
# are dropped), then plot Codeforces rating against AtCoder rating, colouring
# each point by the user's contest count. `df` stays filtered for the rest of
# the script.
#
# pyplot is called directly: `sns.plt` was only an incidental alias for
# matplotlib.pyplot and is not available in current seaborn releases.
df = df.dropna()
plt.scatter(df['rating_codeforces'], df['rating_atcoder'], c=df['count'], cmap=cm.gist_rainbow)
# Colour bar label: "Number of AtCoder participations".
cb = plt.colorbar(label=u'AtCoderの参加回数')
# colorbar() takes no font argument, so the font is set on the label itself.
cb.ax.yaxis.label.set_font_properties(prop)
# Title: "Relationship between AtCoder rating and Codeforces rating".
plt.title(u'AtCoderのレーティングとCodeforcesのレーティングの関係', fontproperties=prop)
plt.xlabel(u'Codeforcesのレーティング', fontproperties=prop)
plt.ylabel(u'AtCoderのレーティング', fontproperties=prop)
plt.show()
# Pairwise Spearman rank correlations between all columns.
print(df.corr('spearman'))
# Spearman correlation between contest count and AtCoder rating, recomputed
# on the users with at least i contests, for i = 1..9, and plotted against i.
#
# pyplot is called directly: `sns.plt` was only an incidental alias for
# matplotlib.pyplot and is not available in current seaborn releases. The
# redundant mid-script re-import of seaborn has been removed.
idx = list(range(1, 10))
value = [
    df[df['count'] >= min_count].corr('spearman')['rating_atcoder']['count']
    for min_count in idx
]
plt.plot(idx, value)
# Title: "Correlation between AtCoder participation count and AtCoder rating".
plt.title(u'AtCoder出場回数とAtCoderのレーティングとの相関係数の関係', fontproperties=prop)
# x: "Minimum number of participations of the users included".
plt.xlabel(u'何回以上出場した人のデータを使って計算したか', fontproperties=prop)
# y: "Spearman's rank correlation coefficient".
plt.ylabel(u'スピアマンの順位相関係数', fontproperties=prop)
plt.show()
# Spearman correlation between Codeforces rating and AtCoder rating, computed
# separately for the users with exactly i contests, for i = 1..10, and
# plotted against i.
#
# pyplot is called directly: `sns.plt` was only an incidental alias for
# matplotlib.pyplot and is not available in current seaborn releases. The
# redundant mid-script re-import of seaborn has been removed.
idx = list(range(1, 11))
value = [
    df[df['count'] == exact_count].corr('spearman')['rating_atcoder']['rating_codeforces']
    for exact_count in idx
]
plt.plot(idx, value)
# Title: "Correlation between Codeforces and AtCoder ratings vs. participation count".
plt.title(u'CodeforcesのレーティングとAtCoderのレーティングの相関係数と出場回数の関係', fontproperties=prop)
# x: "Exact number of participations of the users included".
plt.xlabel(u'ちょうど何回出場した人のデータを使って計算したか', fontproperties=prop)
# y: "Spearman's rank correlation coefficient".
plt.ylabel(u'スピアマンの順位相関係数', fontproperties=prop)
plt.show()
# Ridge regression (no intercept) of the AtCoder rating on the Codeforces
# rating plus a one-hot encoding of the contest count. Prints the fitted
# coefficients and intercept, then the stored cross-validation values
# averaged along axis 0 (one figure per candidate alpha, per scikit-learn's
# documented layout for a single target).
import sklearn.compose
import sklearn.linear_model
import sklearn.preprocessing

ridge_options = dict(
    alphas=[0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    fit_intercept=False,
)
# The flag that keeps the per-sample cross-validation values is spelled
# `store_cv_results` in newer scikit-learn releases and `store_cv_values` in
# older ones. An unknown keyword raises TypeError, so the new spelling is
# tried first and the old one is the fallback.
try:
    model = sklearn.linear_model.RidgeCV(store_cv_results=True, **ridge_options)
except TypeError:
    model = sklearn.linear_model.RidgeCV(store_cv_values=True, **ridge_options)

# `OneHotEncoder(categorical_features=[1])` is no longer accepted by
# scikit-learn. ColumnTransformer expresses the same transformation: one-hot
# encode 'count' and pass 'rating_codeforces' through unchanged. As before,
# the encoded columns come first and the untouched column is stacked on the
# right.
enc = sklearn.compose.ColumnTransformer(
    [('count', sklearn.preprocessing.OneHotEncoder(), ['count'])],
    remainder='passthrough',
)
features = df[['rating_codeforces', 'count']]
enc.fit(features)
model.fit(enc.transform(features), df['rating_atcoder'])
print('{} {}'.format(model.coef_, model.intercept_))
# Read whichever attribute this scikit-learn version provides.
cv_errors = getattr(model, 'cv_results_', None)
if cv_errors is None:
    cv_errors = model.cv_values_
print(cv_errors.mean(axis=0))