ソースコード
結果の収集に使ったソースコード
import pyquery
import time
import requests
# URLs that must never be collected as calendar articles; the article
# collectors below drop any link whose href is listed here.
banned = {
    'http://www.4gamer.net/games/074/G007497/20120713064/',
    'http://horiemon.com/talk/11346/',
    'https://twitter.com/',
    'https://www.google.co.jp',
}
def getQiitaCalendarList(year, page):
    """Scrape one page of the Qiita Advent Calendar index.

    Args:
        year: Year inserted into the index URL.
        page: Page number inserted into the index URL.

    Returns:
        set: ``(name, title)`` tuples, one per calendar title link found on
        the page. ``name`` is the link's href with its first 22 characters
        removed and ``title`` is the link text.
    """
    listing_url = 'http://qiita.com/advent-calendar/{}/calendars?page={}'.format(year, page)
    listing = pyquery.PyQuery(url=listing_url)
    links = (
        pyquery.PyQuery(node)
        for node in listing.find('.adventCalendarList_calendarTitle > a')
    )
    # 22 == len('/advent-calendar/YYYY/'); presumably the hrefs are
    # site-relative paths of that shape -- verify against the live markup.
    return {(link.attr('href')[22:], link.text()) for link in links}
def getQiitaArticles(name, year=2016):
    """Collect the article URLs linked from one Qiita Advent Calendar page.

    Args:
        name: Calendar identifier, used as the last path segment of
            ``http://qiita.com/advent-calendar/<year>/<name>``.
        year: Year used to build the calendar URL. Defaults to 2016, so
            calls that pass only ``name`` fetch the same page as before.

    Returns:
        set: href values of the entry links that contain ``'http'`` and are
        not listed in ``banned``.
    """
    calendar = pyquery.PyQuery(
        url='http://qiita.com/advent-calendar/{}/{}'.format(year, name))
    articles = set()
    for elm in calendar.find('.adventCalendarItem_entry > a'):
        url = pyquery.PyQuery(elm).attr('href')
        # attr() returns None for an anchor without an href; skip it rather
        # than failing on `'http' in None`.
        if url and 'http' in url and url not in banned:
            articles.add(url)
    return articles
def getAdventarCalendarList(year):
    """Scrape the Adventar calendar index for the given year.

    Args:
        year: Year inserted into the ``calendars?year=`` query.

    Returns:
        set: ``(calendar_url, title)`` tuples, one per calendar title link.
        ``calendar_url`` is the link's href prefixed with the Adventar
        origin and ``title`` is the link text.
    """
    origin = 'http://www.adventar.org'
    listing = pyquery.PyQuery(url='http://www.adventar.org/calendars?year={}'.format(year))
    links = (
        pyquery.PyQuery(node)
        for node in listing.find('.mod-calendarList-title > a')
    )
    return {(origin + link.attr('href'), link.text()) for link in links}
def getAdventarArticles(url):
    """Collect the article URLs linked from one Adventar calendar page.

    Args:
        url: Absolute URL of the calendar page to fetch.

    Returns:
        set: href values of the entry links whose visible text contains
        ``'http'`` and whose href is not listed in ``banned``.
    """
    page = pyquery.PyQuery(url=url)
    links = (
        pyquery.PyQuery(node)
        for node in page.find('.mod-entryList-url > a')
    )
    # The 'http' test is applied to the link text, while the banned check
    # and the collected value both use the href.
    return {
        link.attr('href')
        for link in links
        if 'http' in link.text() and link.attr('href') not in banned
    }
def _requestHatenaCounts(urls):
    """Issue one ``entry.counts`` request and return the decoded JSON body."""
    return requests.get('http://api.b.st-hatena.com/entry.counts',
                        params={'url': urls}).json()


def getHatenaBookmarkCount(urls):
    """Look up Hatena Bookmark counts for a collection of URLs.

    URLs are sent in batches of at most 50 per request and the responses
    are merged, so callers may pass any number of URLs. Each batch is
    retried once, after a 2 second pause, if the request or the JSON
    decoding fails.

    Args:
        urls: URLs to look up. An input of 50 or fewer items is forwarded
            to the API unchanged in a single request.

    Returns:
        dict: Merged JSON objects returned by the API (URL -> count).

    Raises:
        requests.RequestException, ValueError: If the retry of a batch
            fails as well.
    """
    batch_size = 50
    if len(urls) <= batch_size:
        batches = [urls]
    else:
        urls = list(urls)
        batches = [urls[start:start + batch_size]
                   for start in range(0, len(urls), batch_size)]

    counts = {}
    for batch in batches:
        try:
            batch_counts = _requestHatenaCounts(batch)
        except (requests.RequestException, ValueError):
            # Network failure or undecodable body: wait briefly, try once
            # more, and let a second failure propagate to the caller.
            time.sleep(2)
            batch_counts = _requestHatenaCounts(batch)
        counts.update(batch_counts)
    return counts
def _escapeTableCell(text):
    """Return ``text`` with every ``|`` replaced by the entity ``&#124;``.

    ``|`` separates the columns of the table rows printed by this script,
    so a raw ``|`` inside a title would split the cell.
    """
    return text.replace('|', '&#124;')


def _tallyCalendar(calendar_url, title, articles):
    """Fetch bookmark counts for a calendar page and its article URLs.

    Returns:
        tuple: ``(summary, rows)``. ``summary`` is
        ``(total_count, calendar_url, title)``; ``rows`` is a list of
        ``(url, count, calendar_url, title)``, one per URL in the response.
    """
    hatebu_count = getHatenaBookmarkCount([calendar_url] + list(articles))
    summary = (sum(hatebu_count.values()), calendar_url, title)
    # NOTE(review): the calendar page itself is part of the lookup, so it
    # can also show up in the per-article ranking -- confirm this is wanted.
    rows = [(url, count, calendar_url, title)
            for url, count in hatebu_count.items()]
    return summary, rows


# Crawl the 2016 Qiita and Adventar calendars, then print two rankings
# (top 100 calendars and top 100 articles by bookmark count) as
# '|'-delimited table rows. Every remote request is followed by a 1 second
# pause. Rows are encoded to UTF-8 byte strings before printing; the
# single-argument print(...) form keeps the output identical on Python 2.
if __name__ == '__main__':
    result = []          # (total count, calendar URL, calendar title)
    result_article = []  # (URL, count, calendar URL, calendar title)

    # Qiita: walk index pages 1..26, then visit every calendar found.
    calendars = set()
    for page in range(1, 27):
        calendars |= getQiitaCalendarList(2016, page)
        time.sleep(1)
    for name, title in calendars:
        articles = getQiitaArticles(name)
        calendar_url = 'http://qiita.com/advent-calendar/2016/' + name
        summary, rows = _tallyCalendar(calendar_url, title, articles)
        result.append(summary)
        result_article.extend(rows)
        time.sleep(1)

    # Adventar: a single index request, then visit every calendar found.
    calendars = getAdventarCalendarList(2016)
    time.sleep(1)
    for calendar_url, title in calendars:
        articles = getAdventarArticles(calendar_url)
        summary, rows = _tallyCalendar(calendar_url, title, articles)
        result.append(summary)
        result_article.extend(rows)
        time.sleep(1)

    # Calendar ranking: tuples sort by total count first, highest first.
    result.sort(reverse=True)
    print(u'|*順位|*カレンダー名|*はてなブックマーク数|'.encode('utf-8'))
    for rank, (count, name, title) in enumerate(result[:100], 1):
        print(u'|{0}|<a href="{1}">{2}</a>|{3}|'.format(
            rank, name, _escapeTableCell(title), count).encode('utf-8'))

    # Article ranking: highest bookmark count first.
    result_article.sort(key=lambda row: -row[1])
    print('')
    print(u'|*順位|*記事名|*はてなブックマーク数|*カレンダー名|'.encode('utf-8'))
    for rank, (url, count, calendar_url, title) in enumerate(result_article[:100], 1):
        # The article title is read from the <title> element of the page.
        page_title = pyquery.PyQuery(url=url).find('title').text()
        print(u'|{0}|<a href="{1}">{2}</a>|{3}|<a href="{4}">{5}</a>|'.format(
            rank, url, _escapeTableCell(page_title), count,
            calendar_url, _escapeTableCell(title)).encode('utf-8'))
        time.sleep(1)