import sys
import os
import re
import locale
import htmlentitydefs
class Siteinfo:
    """Settings scraped from the ``<siteinfo>`` header of a MediaWiki dump.

    Attributes set by the constructor:

    first_letter_case -- True when the content of the ``<case>`` element is
        anything other than ``'first-letter'``, or when there is no such
        element at all.
        NOTE(review): the name suggests the opposite polarity; nothing in
        this file reads the attribute, so the original meaning is kept.
    namespace -- dict mapping each namespace name followed by ``':'``
        (e.g. ``'Discussion:'``) to its integer key.
    """

    def __init__(self, text):
        """Parse *text*, the dump header up to and including </siteinfo>."""
        # The leading greedy '.*' selects the last <case> element, which is
        # what the previous whole-string substitution did as well.
        found = re.search(r'(?s).*<case>(.*?)</case>', text)
        self.first_letter_case = (found is None
                                  or found.group(1) != 'first-letter')

        self.namespace = {}
        # FIXME: this assumes namespace 0 is unnamed (a self-closing tag),
        # so that it never shows up here.
        # '[^>]*' tolerates extra attributes such as case="first-letter";
        # '[^<\n]*' keeps a name from running into a neighbouring tag or
        # onto the next line.  Negative keys are not matched by '\d+' and
        # are therefore skipped, as before.
        pattern = r'<namespace key="(\d+)"[^>]*>([^<\n]*)</namespace>'
        for entry in re.finditer(pattern, text):
            self.namespace[entry.group(2) + ':'] = int(entry.group(1))
def read_header():
    """Read the dump header from standard input.

    Lines are consumed from ``sys.stdin`` up to and including the one that
    ends with ``</siteinfo>`` (a trailing ``\\n`` or ``\\r\\n`` is
    tolerated).  The consumed lines are returned as a single string; the
    rest of the stream is left for the caller.

    If the closing tag never appears, everything that was read is returned
    instead of ``None``, so the caller always receives a string.
    """
    lines = []
    for line in sys.stdin:
        lines.append(line)
        if line.rstrip('\r\n').endswith('</siteinfo>'):
            break
    return ''.join(lines)
def build_regex_from_list(lst):
    """Return a regex group matching any one of the fragments in *lst*.

    The fragments are joined with ``|`` inside a single pair of
    parentheses, in iteration order.  They are inserted verbatim (not
    escaped), so callers may pass regex fragments.

    An empty *lst* yields ``'()'``, a valid pattern matching the empty
    string, rather than the unbalanced ``')'``.
    """
    return '(' + '|'.join(lst) + ')'
def capitalize(s):
    """Upper-case the first character of the UTF-8 encoded string *s*.

    *s* is decoded so that a multi-byte first character is handled as one
    character, then re-encoded to UTF-8 before being returned.  The rest
    of the string is left unchanged.  An empty string is returned as an
    empty string instead of raising IndexError.
    """
    decoded = s.decode('utf-8')
    # Slicing with [:1] rather than indexing [0] is what makes '' safe.
    decoded = decoded[:1].upper() + decoded[1:]
    return decoded.encode('utf-8', 'replace')
def replace_html_entity(matchobj):
return unichr(htmlentitydefs.name2codepoint[matchobj.group(1)])
# Alternation group listing every entity name known to htmlentitydefs.
html_entities = build_regex_from_list(list(htmlentitydefs.entitydefs))
# Compiled pattern for '&name;'; group 1 is the bare entity name.
html_entity = re.compile(u'&%s;' % html_entities)
def replace_entity(text):
    """Replace named HTML entities in the UTF-8 string *text*.

    The input is decoded from UTF-8, every ``&name;`` matched by
    ``html_entity`` is replaced through ``replace_html_entity``, and the
    result is returned re-encoded as UTF-8.
    """
    # Numeric references such as &#996; are left as they are: the pass
    # that would translate codepoints first is disabled.
    #text = re.sub(u'&#(\d+|x[0-9a-fA-F]+);', replace_codepoint, text)
    decoded = unicode(text, 'utf-8')
    substituted = html_entity.sub(replace_html_entity, decoded)
    return substituted.encode('utf-8', 'replace')
def split_title(title, siteinfo):
    """Split a page title or link target into ``(namespace, title)``.

    Both parts are returned in the form used for paths on disk:

    * a single leading ``':'`` is dropped;
    * when the text before the first ``':'`` (first letter capitalised)
      is a key of ``siteinfo.namespace``, it becomes the namespace, with
      spaces turned into underscores; otherwise the namespace is
      ``'Article'`` and the whole string is the title;
    * anything from the first ``'#'`` on is discarded;
    * the title is stripped of surrounding spaces, its first letter is
      capitalised, spaces become ``'_'``, ``'/'`` becomes ``'%2F'`` and
      named HTML entities are replaced.

    ``('', '')`` is returned when no title is left.

    Differences from the previous implementation:

    * a namespace is only recognised when a ``':'`` is actually present,
      so an article whose whole title equals a namespace name is no
      longer discarded;
    * empty, ``'#fragment'``-only and whitespace-only titles return
      ``('', '')`` instead of raising IndexError or yielding an empty
      title with a non-empty namespace;
    * spaces are stripped before capitalising, so ``' foo'`` gives
      ``'Foo'``.
    """
    if title.startswith(':'):
        title = title[1:]

    namespace = 'Article'
    prefix, colon, remainder = title.partition(':')
    if colon and prefix:
        candidate = capitalize(prefix)
        if candidate + ':' in siteinfo.namespace:
            namespace = candidate.replace(' ', '_')
            title = remainder

    # Drop the section anchor, then the padding around the title proper.
    title = title.split('#')[0].strip(' ')
    if not title:
        return ('', '')

    title = capitalize(title)
    title = title.replace(' ', '_')
    # A '/' would otherwise be read as a directory separator on disk.
    title = title.replace('/', '%2F')
    title = replace_entity(title)
    return (namespace, title)
def out_text(fd, text):
    """Write *text* to *fd* after replacing named HTML entities.

    The text goes through ``replace_entity`` and is emitted with the
    print statement; the trailing comma keeps print from appending a
    newline of its own.
    """
    converted = replace_entity(text)
    print >> fd, converted,
def create_symlink(last_namespace, last_title, namespace, title):
    """Replace the file of a redirect page by a symlink to its target.

    ``fr/<last_namespace>/<last_title>`` is removed and recreated as a
    symbolic link to the page identified by *namespace* and *title*.  The
    link is relative: the bare title when both pages share a namespace,
    ``../<namespace>/<title>`` otherwise.

    Nothing happens when *title* is empty or when the page redirects to
    itself.
    """
    if len(title) < 1:
        return
    # ignore self-redirect.
    if last_namespace == namespace and last_title == title:
        return

    link_path = 'fr/' + last_namespace + '/' + last_title
    os.remove(link_path)
    if last_namespace == namespace:
        target = title
    else:
        target = '../' + namespace + '/' + title
    os.symlink(target, link_path)
# Line-oriented patterns used by parse_xml().
# Captures everything on the line that precedes the closing </text> tag.
end_text = re.compile('(.*)</text>')
# Captures the rest of the line after the opening <text> tag.
start_text = re.compile('.*<text xml:space="preserve">(.*)')
# Captures the page title between <title> and </title>.
title_text = re.compile('<title>(.*)</title>')
# Case-insensitive: '#', the word REDIRECT, any run of characters other
# than ']', then '[[target]]'; group 1 is the target.
redirect_text = re.compile(r'\s*#\s*REDIRECT[^]]*\[\[([^]]*)\]\].*', re.I)
def parse_xml(f, siteinfo):
    """Split the page stream *f* into one file per page under ``fr/``.

    *f* is iterated line by line.  Each ``<title>`` line opens
    ``fr/<namespace>/<title>`` for writing, with both parts computed by
    ``split_title``; pages whose title comes out empty are skipped.  The
    content between ``<text xml:space="preserve">`` and ``</text>`` is
    written to that file through ``out_text``.  When the first line of the
    text is a redirect, the file is replaced by a symlink through
    ``create_symlink`` and the rest of the text is ignored.

    A counter is written to stderr every 256 titles as a progress display.

    Differences from the previous implementation:

    * files are closed explicitly (on the next title, at the end of the
      text, on a redirect and when the function exits) instead of being
      left to the garbage collector;
    * the per-page state is initialised, so a ``<text>`` line seen before
      any ``<title>`` is ignored instead of raising NameError;
    * the newline ending the first line of a multi-line text is written
      out.  It used to be dropped by ``start_text``, after which the
      print statement's soft space joined the first two lines with a
      blank.
    """
    in_article = False
    count = 0
    fd = None
    last_namespace = ''
    last_title = ''
    try:
        for line in f:
            match = title_text.search(line)
            if match:
                count += 1
                if count % 256 == 0:
                    sys.stderr.write(str(count) + '\r')
                if fd is not None:
                    fd.close()
                    fd = None
                (last_namespace, last_title) = split_title(match.group(1),
                                                           siteinfo)
                if last_title != '':
                    fd = open('fr/' + last_namespace + '/' + last_title, 'w')

            match = start_text.search(line)
            if match and last_title != '':
                text = match.group(1)
                in_article = True
                redirect = redirect_text.search(text)
                if redirect:
                    (namespace, title) = split_title(redirect.group(1),
                                                     siteinfo)
                    # Close before the file is swapped for a symlink.
                    if fd is not None:
                        fd.close()
                        fd = None
                    create_symlink(last_namespace, last_title,
                                   namespace, title)
                    in_article = False
                    continue
                closing = end_text.search(text)
                if closing:
                    # The whole text sits on this single line.
                    text = closing.group(1)
                    in_article = False
                elif line.endswith('\n'):
                    # start_text's '(.*)' stops before the line ending;
                    # put it back so the first two lines stay separate.
                    text += '\n'
                out_text(fd, text)
                if closing:
                    if fd is not None:
                        fd.close()
                    fd = None
            else:
                closing = end_text.search(line)
                if closing and in_article:
                    out_text(fd, closing.group(1))
                    in_article = False
                    if fd is not None:
                        fd.close()
                    fd = None
                elif in_article:
                    out_text(fd, line)
    finally:
        # Covers both a truncated dump and an exception part-way through.
        if fd is not None:
            fd.close()
def extract_files():
    """Extract the dump arriving on standard input into ``fr/``.

    Steps, in order: set the LC_CTYPE and LC_COLLATE locale categories to
    ``fr_FR.utf8``, read and parse the dump header, create ``fr`` with one
    sub-directory per namespace (spaces replaced by underscores) plus
    ``fr/Article``, then hand the remainder of stdin to ``parse_xml``.
    """
    for category in (locale.LC_CTYPE, locale.LC_COLLATE):
        locale.setlocale(category, 'fr_FR.utf8')

    siteinfo = Siteinfo(read_header())

    directories = ['fr']
    for prefix in siteinfo.namespace:
        # Each key carries a trailing ':' which is not part of the name.
        directories.append('fr/' + prefix[:-1].replace(' ', '_'))
    directories.append('fr/Article')
    for path in directories:
        os.mkdir(path)

    parse_xml(sys.stdin, siteinfo)
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    extract_files()
#import profile
#prof = profile.Profile()
#try:
# prof.run('extract_files()')
#finally:
# prof.print_stats()