A Python script for finding changes between Astlan EPUB editions:
[code=python]
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import difflib
import epub
from bs4 import BeautifulSoup
def text_paras(book, href: str) -> list:
    """Return the text of every ``<p>`` element in one item of an EPUB.

    The item is read with ``book.read_item(href)``, parsed by BeautifulSoup
    using the ``lxml`` parser, and each paragraph is reduced to its plain
    text via ``get_text()``.

    Args:
        book: An opened EPUB object exposing ``read_item``.
        href: Manifest href of the item to read.

    Returns:
        A list of paragraph strings, in document order.
    """
    soup = BeautifulSoup(book.read_item(href), 'lxml')
    return [x.get_text() for x in soup.find_all('p')]
# Open the two editions to compare.
# NOTE(review): the dash in these file names is an en dash (U+2013), kept
# exactly as found. The quotes in this script had been converted to
# typographic ones, so the dash may originally have been a plain hyphen --
# verify against the actual file names on disk.
book1 = epub.open_epub('Apostles of Doom Alpha 1 – J. L. Langland.epub')
book2 = epub.open_epub('Apostles of Doom Alpha 2 – J. L. Langland.epub')

# Walk the manifest of the first edition; every XHTML item is compared with
# the item of the same href in the second edition.
for item in book1.opf.manifest.values():
    print(item.href)
    if item and item.media_type == 'application/xhtml+xml':
        paras1 = text_paras(book1, item.href)
        paras2 = text_paras(book2, item.href)
        # Diff at paragraph granularity: each list element is one paragraph.
        s = difflib.SequenceMatcher(None, paras1, paras2)
        for opcode in s.get_opcodes():
            print("%6s a[%d:%d] b[%d:%d]" % opcode)
            # opcode is (tag, i1, i2, j1, j2): a[i1:i2] maps to b[j1:j2].
            tag, i1, i2, j1, j2 = opcode
            if tag == 'insert':
                print('B: ', paras2[j1:j2])
            elif tag == 'replace':
                print('A: ', paras1[i1:i2])
                print('B: ', paras2[j1:j2])
            elif tag == 'delete':
                print('A: ', paras1[i1:i2])
            # 'equal' ranges only get the header line printed above.
[/code]