# -*- coding: utf-8 -*-
import urllib
import urllib2
import re
import sys
import traceback
import itertools
import datetime
import time
import codecs
import os
import os.path
import multiprocessing
import glob
def _get_page(url):
    """Fetch ``url`` and return its body decoded from windows-1255.

    The request is retried up to 20 times with a 60 second timeout per
    attempt and a 60 second pause between attempts.

    Args:
        url: address of the page to download.

    Returns:
        The page content as a ``unicode`` object.

    Raises:
        urllib2.URLError: if every attempt failed.
    """
    attempts = 20
    to = 60
    sleep = 60
    for attempt in xrange(attempts):
        #print 'retrieving', url, '...'
        try:
            resp = urllib2.urlopen(url, timeout=to)
            try:
                content = resp.read()
            finally:
                # Always release the connection, even when read() fails.
                resp.close()
            #print 'done'
            return unicode(content, 'windows-1255')
        except IOError as e:
            # IOError covers urllib2.URLError as well as socket.error /
            # socket.timeout, which read() raises without wrapping them
            # in URLError.
            sys.stderr.write('Failed to fetch %s: %s\n' % (url, str(e)))
            # No point in waiting when there is no further attempt.
            if attempt < attempts - 1:
                time.sleep(sleep)
    raise urllib2.URLError('Failed to fetch %s' % url)
def _get_res(url, fname):
attempts = 20
to = 240
sleep = 60
for i in xrange(attempts):
print 'retrieving', url, 'to', fname, '...'
try:
urllib.urlretrieve(url, fname)
print 'done'
return
except urllib2.URLError, e:
sys.stderr.write('Failed to fetch %s: %s\n' % (url, str(e)))
time.sleep(sleep)
sys.stderr.write('Failed to fetch %s\n' % url)
def _num_main_pages():
url = 'http://www.bhol.co.il/forums/forum.asp?forum_id=1364'
content = _get_page(url)
n = max([int(d) for d in _num_main_pages._main_pages_re.findall(content)])
#print 'altogether %d main pages' % n
return n
_num_main_pages._main_pages_re = re.compile(ur'<option value="(\d+?)">', re.UNICODE)
def _text_to_clusters(text):
return [(int(m[0]), m[1], m[2]) for m in _text_to_clusters._cluster_re.findall(text)]
_text_to_clusters._cluster_re = re.compile(ur'<a href="topic.asp\?.*?cat_id=24&topic_id=(\d+?)" class="par2" style="dir: rtl">(.+?)</a></span><br /><span class="par2" style="font-size:8pt;"> מחבר: <a href="usercard.asp\?uid=.+?">(.+?)</a></span></td>', re.UNICODE)
def _main_page_clusters(page_num):
url = 'http://www.bhol.co.il/forums/forum.asp?page=%d&forum_id=1364' % page_num
try:
cs = _text_to_clusters( _get_page(url) )
print '%s clusters in %s' % (str([c[0] for c in cs]), url)
return cs
except Exception, e:
sys.stderr.write('Failed to find clusters in %s\n' % url)
raise
def _all_clusters():
    """Yield ``(topic_id, title, author)`` for every topic of the forum.

    Index pages are visited in order, from page 1 up to the page count
    reported by _num_main_pages.
    """
    last_page = _num_main_pages()
    page_num = 1
    while page_num <= last_page:
        for topic_id, title, author in _main_page_clusters(page_num):
            yield (topic_id, title, author)
        page_num += 1
def _num_pages_in_cluster(content):
found = [int(d) for d in _num_pages_in_cluster._pages_re.findall(content)]
n = max(found) if found else 1
print 'altogether %d pages' % n
return n
_num_pages_in_cluster._pages_re = re.compile(ur'<option value=\'(\d+?)\'>', re.UNICODE)
_user_re = re.compile(
ur'menubar\d+?\.addItem\("1", "(.+?)", "", "", true, null, "user\d+?"\);')
def _parse_seg(url, i, seg):
auth = _user_re.search(seg).groups()[0]
(day, month, year, hour, sec) = (int(i) for i in _parse_seg._sent_in_re.search(seg).groups())
dt = datetime.datetime(year, month, day, hour, sec)
for re in [_parse_seg._wo_title_msg_start_re, _parse_seg._w_title_msg_start_re]:
m = re.search(seg)
if m:
seg = m.groups()[0]
seg = seg[: seg.find(_parse_seg._msg_end)]
_seg_res(seg)
break
else:
sys.stderr.write('Failed to find segment in %s %d\n' % (url, i))
seg = 'ERROR'
return (auth, dt, seg)
_parse_seg._sent_in_re = re.compile(ur'<td align=\'right\' width=\'60%\' style=\'padding-right: 4px; font-size: 8pt; color: gray\'>נשלח ב-(\d+)/(\d+)/(\d+) (\d+):(\d+)</td>', re.UNICODE)
_parse_seg._w_title_msg_start_re = re.compile(ur'</u></b></font><br /><font size=\'2\'><b></b></font><p><font size=\'2\'>(.*)',
re.UNICODE | re.DOTALL)
_parse_seg._wo_title_msg_start_re = re.compile(ur'<b>\s?<u>\s?</u>\s?</b>\s?</font>\s?<br\s?/>\s?<font size=\'2\'>\s?<b>\s?</b>\s?</font>\s?<p>\s?<font size=\'2\'>(.*)', re.UNICODE | re.DOTALL)
_parse_seg._msg_end = u'</font><br /></p><br><br><table width="100%"><tr><td align="left"><a align="left" href=":void(0);" =":openWindow2(\'report.asp?'
def _page_msgs(url, content):
#print 'parsing page messages %s' % url
try:
starts = [m.start() for m in _user_re.finditer(content)] + [len(content)]
print 'found %d messages' % (len(starts) - 1)
for (i, b, e) in itertools.izip(xrange(len(starts) - 1), starts[: -1], starts[1 :]):
yield _parse_seg(url, i, content[b : e])
except Exception, e:
sys.stderr.write('Problem: %s in %s\n' % (url, str(e)))
traceback.print_exc(file = sys.stderr)
raise
def _seg_res(content):
for url in _seg_res._img_re.findall(content):
m = _seg_res._shitty_convention_re.match(url)
if not m:
continue
if m.groups()[0] != m.groups()[1]:
continue
fname = url.replace(':', '_colon_').replace(' ', '_space_').replace('/', '_slash_')
_get_res(url, fname)
_seg_res._img_re = re.compile(ur'<img.*? src="(.+?)".*?/>', re.UNICODE)
_seg_res._shitty_convention_re = re.compile('.*_(.*)\.(.*)', re.UNICODE)
def _cluster_msgs(url):
    """Yield every parsed message of the topic at ``url``, page by page.

    The first page is downloaded once: it supplies the page count and is
    also parsed as page 1. Later pages are fetched with a ``whichpage``
    parameter inserted in front of the original query string.
    """
    content = _get_page(url)
    num_pages = _num_pages_in_cluster(content)
    for page in xrange(1, 1 + num_pages):
        parts = url.split('?')
        page_url = '%s?whichpage=%d&%s' % (parts[0], page, parts[1])
        if page != 1:
            content = _get_page(page_url)
        for msg in _page_msgs(page_url, content):
            yield msg
def cluster_worker(q):
    """Process topics taken from the queue ``q`` until ``None`` arrives.

    Every queue item is a ``(num, title, auth)`` tuple. For each message of
    the topic one ``$``-separated line is appended to ``scrape.txt``: topic
    number, topic title, topic author, message author, send time and the
    number of characters of the message lying between the Hebrew letters
    alef and tav. A failure in a topic is reported on stderr and the worker
    moves on to the next item.
    """
    # iter() with a sentinel keeps calling q.get() until it returns None.
    for num, title, auth in iter(q.get, None):
        url ='http://www.bhol.co.il/forums/topic.asp?cat_id=24&topic_id=%d&forum_id=1364' % num
        try:
            for (auth_, dt, seg) in _cluster_msgs(url):
                with codecs.open('scrape.txt', 'a', encoding = 'utf-8') as f:
                    hebrew = [ch for ch in seg if u'א' <= ch <= u'ת']
                    fields = [str(num), title, auth, auth_, str(dt), str(len(hebrew))]
                    f.write('$'.join(fields) + '\n')
        except Exception as e:
            sys.stderr.write('Problem in %d: %s\n' % (num, str(e)))
if __name__ == '__main__':
    # Start from an empty output file; on a first run there is nothing to
    # remove, which must not abort the scrape.
    if os.path.exists('scrape.txt'):
        os.remove('scrape.txt')
    num_workers = multiprocessing.cpu_count() * 8
    #num_workers = 1
    workers = []
    q = multiprocessing.Queue()
    for _ in xrange(num_workers):
        w = multiprocessing.Process(target = cluster_worker, args = (q, ))
        w.start()
        workers.append(w)
    try:
        for (num, title, auth) in _all_clusters():
            q.put((num, title, auth))
    finally:
        # Always send one stop marker per worker and wait for them, so a
        # failure while listing topics cannot leave the workers blocked on
        # the queue forever.
        for _ in workers:
            q.put(None)
        for worker in workers:
            worker.join()