Code: Select all
# -*- encoding: UTF-8 -*-
import os, sys, numpy, itertools
from bs4 import BeautifulSoup

# Obtain file paths and polling station IDs
def file_paths():
    rootdir = u'/Users/mbp/Desktop/Out/VCIK'
    for root, dirs, files in os.walk(rootdir):
        for file in files:
            if not (os.path.splitext(file)[0] in dirs) and file != '.DS_Store':
                filepath = os.path.join(root, file)
                # remove rootdir from the name <.replace(rootdir, '')>,
                # drop one more directory level <split('/')[2:]>,
                # strip .html, and join everything back into a string with / separators
                id = '/'.join(filepath.replace(rootdir, '').split('/')[2:]).replace(u'.html', '')
                yield filepath, id
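# For illustration (my assumption about the on-disk layout, mirroring the downloader in the
# second script below):
#   <rootdir>/Region X/Territory Y/Station Z.html  ->  id = u'Territory Y/Station Z'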
# Parse initial file
def parse_init(filepath, id):
    source = open(filepath, 'r').read()
    soup = BeautifulSoup(source, from_encoding="cp1251")
    # create headers (categories)
    data = [[u'id']]
    table = soup.findAll('table')[8]
    for j, row in enumerate(table.findAll('tr')):
        element = row.findAll('td')[1]
        if element:
            data.append([element.text.strip().split('\n', 1)[0]])
    # populate table with figures
    table = soup.findAll('table')[9]
    for j, row in enumerate(table.findAll('tr')):
        cols = row.find_all('td')
        for element in cols:
            if element:
                if j == 0: data[0].extend([id])
                data[j+1].extend([element.text.strip().split('\n', 1)[0]])
    return data
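# Shape of the accumulating data structure (a sketch):
#   data[0]   -> [u'id', <id>, ...]               id header row, extended as each file is parsed
#   data[j+1] -> [<category j>, <value>, ...]     one category label followed by its figures
# transpose() further down flips this so that each output row corresponds to one polling station.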
# Parse remaining files
def parse(filepath, id, data):
    source = open(filepath, 'r').read()
    soup = BeautifulSoup(source, from_encoding="cp1251")
    table_check = soup.findAll('table')[8]
    table = soup.findAll('table')[9]
    for j, row in enumerate(table.findAll('tr')):
        cols = row.find_all('td')
        # check that the headers (categories) match, then populate the table with figures
        if data[j+1][0] == table_check.findAll('tr')[j].findAll('td')[1].text.strip():
            for element in cols:
                if element:
                    if j == 0: data[0].extend([id])
                    data[j+1].extend([element.text.strip().split('\n', 1)[0]])
        else:
            print("ERROR! header mismatch in " + filepath)
    return data
# transpose data set
def transpose(d):
    trans = map(list, itertools.izip_longest(*d, fillvalue='-'))
    return trans
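# Example of the transposition (Python 2; izip_longest pads ragged rows with '-'):
#   transpose([[u'id',   u'A', u'B'],
#              [u'cat1', u'1', u'2']])
#   -> [[u'id', u'cat1'], [u'A', u'1'], [u'B', u'2']]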
# write out to a tab-delimited file (fields separated by TAB \u0009, rows by CR \u000D)
def write_out(data):
    for j in range(len(data)):
        for k in range(len(data[j])):
            if data[j][k]:
                output.write(data[j][k].encode('utf8'))
                output.write(u"\u0009")
        output.write(u"\u000D")

data = []                    # dataframe
init_flag = 1                # initialization flag
files = list(file_paths())   # files to process
for i, (filepath, id) in enumerate(files):
    print(id)
    if init_flag:
        data = parse_init(filepath, id)
        init_flag = 0
    else:
        data = parse(filepath, id, data)
if data:
    data = transpose(data)
    output = open('/Users/mbp/Desktop/elections_2016.txt', 'w')
    write_out(data)
    output.close()
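A quick way to check the result is to read the file back and count rows and columns. This is only a sketch, assuming the same output path as above; it is not part of the original script.
Code: Select all
# Sanity check for the output above (a sketch; assumes the same output path).
# Rows are separated by CR (\u000D) and fields by TAB (\u0009), as written by write_out().
raw = open('/Users/mbp/Desktop/elections_2016.txt', 'rb').read().decode('utf8')
rows = [r.rstrip(u'\t').split(u'\t') for r in raw.split(u'\r') if r.strip()]
print(len(rows))     # header row + data rows (one per polling-station column collected)
print(len(rows[0]))  # the 'id' column + one column per category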
Code: Select all
# -*- encoding: UTF-8 -*-
import re, urllib, os

root_url = 'http://www.vybory.izbirkom.ru/region/region/izbirkom?action=show&root=1&tvd=100100067795854&vrn=100100067795849&region=0&global=true&sub_region=99&prver=0&pronetvd=0&vibid=100100067795854&type=233'

def escape_filename(name):
    for bad_fn_char in '%' + "/\\:*?<>|":
        if bad_fn_char in name:
            name = name.replace(bad_fn_char, '%02x' % ord(bad_fn_char))
    return name
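# For illustration: characters that are unsafe in file names are replaced by their
# two-digit hex codes, e.g. escape_filename(u'A/B:C') -> u'A2fB3aC'.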
download_counter = 0
download_log = open('download.log', 'w')

# Return a cached copy from disk if it exists and looks like complete HTML;
# otherwise download the page and log the download.
def get_page(url, path, name):
    page = None
    if os.path.exists(get_save_name(path, name)):
        f = open(get_save_name(path, name), 'rb')
        try:
            page = f.read()
            if page.rstrip().endswith('</html>'):
                return page
            else:
                print 'warning, cached page is not valid html', url, repr(get_save_name(path, name))
                page = None
        finally:
            f.close()
    if page is None:
        global download_counter
        download_counter += 1
        print >> download_log, download_counter, url
        return urllib.urlopen(url).read()
def get_sub(page, url_diagnostic):
    subs = []
    # --------------------- combobox ------------
    combobox_text = re.findall('<select name="gs">(.*?)</select>', page)
    assert len(combobox_text) <= 1
    if len(combobox_text) > 0:
        for m in re.finditer('<option value="(.*?)">(.*?)</option>', combobox_text[0]):
            url, name = (m.group(1).strip().replace('&amp;', '&'), m.group(2).decode('cp1251').strip())
            if url:
                subs.append((url, name))
    # -------------- site ref ------------
    m = re.search(u'перейдите на < a href = "(.*?)" > сайт'.encode('cp1251').replace(' ', '\s*'), page)
    if m:
        url = m.group(1).strip().replace('&amp;', '&')
        name = None
        for tr in re.finditer('<tr (.|\n)*?</tr>', page):
            if u'Наименование Избирательной комиссии'.encode('cp1251') in tr.group(0):
                name_match = re.search(
                    u' <b> Наименование Избирательной комиссии </b> </td> <td> (.*) </td> </tr> '.replace(' ', '\s*').encode('cp1251'),
                    tr.group(0))
                assert name_match, url_diagnostic
                name = name_match.group(1).strip().decode('cp1251')
        assert name, url_diagnostic
        subs.append((url, name))
    return subs
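# get_sub() returns a list of (url, name) pairs: the entries of the "gs" drop-down and/or the
# "перейдите на сайт" redirect link, with the commission name taken from the page's info table.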
# TODO: see site.
def get_save_name(path, name):
    return path + '/' + name + '.html'

def save(page, path, name):
    fn = get_save_name(path, name)
    if not os.path.isdir(os.path.dirname(fn)):
        os.makedirs(os.path.dirname(fn))
    f = open(fn, 'wb')
    try:
        f.write(page)
    finally:  # py 2.5 compat
        f.close()
filenames = dict()
url_to_filename_file = open('url_2_filename.txt', 'wb')

def recursive_download(url, name, path, print_progress):
    page = get_page(url, path, name)
    if get_save_name(path, name) in filenames:
        print 'warning, duplicated filename %s from %s and %s' % (repr(get_save_name(path, name)), url, filenames[get_save_name(path, name)])
    else:
        save(page, path, name)
        filenames[get_save_name(path, name)] = url
        print >> url_to_filename_file, url, get_save_name(path, name).encode('UTF-8')
    sub_pages = get_sub(page, url)
    for i, (sub_url, sub_name) in enumerate(sub_pages):
        if print_progress:
            print '%s/%s - %s' % (i + 1, len(sub_pages), sub_name.encode('UTF-8'))
        #print sub_url, sub_name.encode('UTF-8')
        sub_name = escape_filename(sub_name)
        recursive_download(sub_url, sub_name, path + '/' + name, print_progress=False)

recursive_download(root_url, 'VCIK', './out/', print_progress=True)
url_to_filename_file.close()
download_log.close()
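The downloader writes the page tree under ./out/VCIK (plus download.log and url_2_filename.txt next to the script), so the parser's rootdir just needs to point at that folder. As a rough check that the crawl finished, something like this counts the cached pages; it is only a sketch for the same setup, not part of the original scripts.
Code: Select all
# Rough sanity check after the crawl (a sketch): count the cached .html files
# under the downloader's output tree before pointing the parser's rootdir at it.
import os

rootdir = './out/VCIK'   # adjust if recursive_download was called with a different path
total = 0
for root, dirs, files in os.walk(rootdir):
    total += len([f for f in files if f.endswith('.html')])
print(total)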