diff --git a/README.mdown b/README.mdown index c3754bd..f919e4b 100644 --- a/README.mdown +++ b/README.mdown @@ -2,68 +2,74 @@ Mobi Python Library =================== **This should be considered alpha quality software.** +**Note: The current iteration of this software dumps data as HTML _without_ Unicode support. The current task is to convert Unicode characters to plain text.** + This library provides a little API for accessing the contents of an unencrypted .mobi file. Here's a short example: - from mobi import Mobi +```python +from mobi import Mobi - book = Mobi("test/CharlesDarwin.mobi"); - book.parse(); +book = Mobi("test/CharlesDarwin.mobi"); +book.parse(); - # this will print, 1 record at a time, the entire contents of the book - for record in book: - print record, +# this will print, 1 record at a time, the entire contents of the book +for record in book: + print record +``` This library provides quite a lot of access to the metadata included in any mobibook. For example, Gutenburg's Origin of the Species: - >>> pprint(book.config) - {'exth': {'header length': 356, - 'identifier': 1163416648, - 'record Count': 15, - 'records': {100: 'Charles Darwin', - 101: 'Project Gutenberg', - 105: 'Natural selection', - 106: '1999-12-01', - 109: 'Public domain in the USA.', - 112: 'http://www.gutenberg.org/files/2009/2009-h/2009-h.htm', - 201: '\x00\x00\x00\x00', - 202: '\x00\x00\x00\x01', - 203: '\x00\x00\x00\x00', - 204: '\x00\x00\x00\x01', - 205: '\x00\x00\x00\x06', - 206: '\x00\x00\x00\x02', - 207: '\x00\x00\x00)', - 300: '\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x00 \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf4\xed\xec\xbe@\x94'}}, - 'mobi': {'DRM Count': 0, - 'DRM Flags': 0, - 'DRM Offset': 4294967295, - 'DRM Size': 0, - 'EXTH flags': 80, - 'First Image index': 334, - 'First Non-book index': 329, - 'Format version': 6, - 'Full Name': 'The Origin of Species by means of Natural Selection, 6th Edition', - 'Full Name Length': 64, - 'Full Name 
Offset': 604, - 'Generator version': 6, - 'Has DRM': False, - 'Has EXTH Header': True, - 'Input Language': 0, - 'Language': 9, - 'Mobi type': 2, - 'Output Language': 0, - 'Start Offset': 2808, - 'Unique-ID': 4046349163, - 'header length': 232, - 'identifier': 1297039945, - 'text Encoding': 1252}, - 'palmdoc': {'Compression': 2, - 'Encryption Type': 0, - 'Unknown': 0, - 'Unused': 0, - 'record count': 327, - 'record size': 4096, - 'text length': 1336365}} - >>> +```python +>>> pprint(book.config) +{'exth': {'header length': 356, + 'identifier': 1163416648, + 'record Count': 15, + 'records': {100: 'Charles Darwin', + 101: 'Project Gutenberg', + 105: 'Natural selection', + 106: '1999-12-01', + 109: 'Public domain in the USA.', + 112: 'http://www.gutenberg.org/files/2009/2009-h/2009-h.htm', + 201: '\x00\x00\x00\x00', + 202: '\x00\x00\x00\x01', + 203: '\x00\x00\x00\x00', + 204: '\x00\x00\x00\x01', + 205: '\x00\x00\x00\x06', + 206: '\x00\x00\x00\x02', + 207: '\x00\x00\x00)', + 300: '\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x00 \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf4\xed\xec\xbe@\x94'}}, + 'mobi': {'DRM Count': 0, + 'DRM Flags': 0, + 'DRM Offset': 4294967295, + 'DRM Size': 0, + 'EXTH flags': 80, + 'First Image index': 334, + 'First Non-book index': 329, + 'Format version': 6, + 'Full Name': 'The Origin of Species by means of Natural Selection, 6th Edition', + 'Full Name Length': 64, + 'Full Name Offset': 604, + 'Generator version': 6, + 'Has DRM': False, + 'Has EXTH Header': True, + 'Input Language': 0, + 'Language': 9, + 'Mobi type': 2, + 'Output Language': 0, + 'Start Offset': 2808, + 'Unique-ID': 4046349163, + 'header length': 232, + 'identifier': 1297039945, + 'text Encoding': 1252}, + 'palmdoc': {'Compression': 2, + 'Encryption Type': 0, + 'Unknown': 0, + 'Unused': 0, + 'record count': 327, + 'record size': 4096, + 'text length': 1336365}} +>>> +``` ## Retrieving Author and Title The author and title of a book can be 
retrieved using the author() and title() diff --git a/example.py b/example.py index 4b3cfd6..2ae51fc 100644 --- a/example.py +++ b/example.py @@ -1,10 +1,11 @@ from mobi import Mobi +from pprint import pprint -book = Mobi("test/CharlesDarwin.mobi"); -book.parse(); +if __name__ == '__main__': + book = Mobi("test/CharlesDarwin.mobi"); + book.parse(); -for record in book: - print record, - -import pprint -pprint.pprint(book.config) \ No newline at end of file + for record in book: + # this prints the entire book out to the console + # it can be piped to an html file + print record diff --git a/mobi/__init__.py b/mobi/__init__.py index 287ed87..864e811 100644 --- a/mobi/__init__.py +++ b/mobi/__init__.py @@ -8,33 +8,51 @@ """ import sys -import os -import unittest from struct import * -from pprint import pprint -import utils from lz77 import uncompress_lz77 class Mobi: def parse(self): """ reads in the file, then parses record tables""" - self.contents = self.f.read(); - self.header = self.parseHeader(); - self.records = self.parseRecordInfoList(); - self.readRecord0() + self.contents = self.f.read() + self.header = self.parseHeader() + self.records = self.parseRecordInfoList() + self.config = self.populate_config() def readRecord(self, recordnum, disable_compression=False): - if self.config: - if self.config['palmdoc']['Compression'] == 1 or disable_compression: - return self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']]; - elif self.config['palmdoc']['Compression'] == 2: - result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']]) - return result + compressionType = self.config['palmdoc']['Compression'] + + try: + start = self.records[recordnum]['record Data Offset'] + + # @TODO offset by record is not always 1 + # the correct record offset can be determined by examining + # `book.records` + end = 
self.records[recordnum + 1]['record Data Offset'] + except KeyError, e: + sys.stderr.write('Could not find key value: %s\n' % str(e)) + return + + # @TODO configuration not present should run configurator. + if not self.config: + return + + if (compressionType == 1 or disable_compression): + return self.contents[start : end] + + elif (compressionType == 2): + extra = self.config['mobi']['extra bytes'] + result = uncompress_lz77(self.contents[start : end - extra]) + return result + else: + sys.stderr.write('Error: could not recognize compression type "%s".' \ + % str(compressionType)) + exit(1) def readImageRecord(self, imgnum): if self.config: - recordnum = self.config['mobi']['First Image index'] + imgnum; - return self.readRecord(recordnum, disable_compression=True); + recordnum = self.config['mobi']['First Image index'] + imgnum + return self.readRecord(recordnum, disable_compression=True) def author(self): "Returns the author of the book" @@ -44,56 +62,61 @@ def title(self): "Returns the title of the book" return self.config['mobi']['Full Name'] -########### Private API ########################### +########################### Private API ########################### def __init__(self, filename): try: + # not sure if explicit type checking is the best way to do this. if isinstance(filename, str): - self.f = open(filename, "rb"); + self.f = open(filename, "rb") else: - self.f = filename; - except IOError,e: - sys.stderr.write("Could not open %s! " % filename); - raise e; - self.offset = 0; + self.f = filename + except IOError, e: + sys.stderr.write("Could not open %s! " % filename) + raise e + self.offset = 0 def __iter__(self): - if not self.config: return; + # @TODO configuration not present should run configurator. 
+ if not self.config: + return + for record in range(1, self.config['mobi']['First Non-book index'] - 1): - yield self.readRecord(record); + yield self.readRecord(record) def parseRecordInfoList(self): - records = {}; + records = {} + # read in all records in info list for recordID in range(self.header['number of records']): - headerfmt = '>II' - headerlen = calcsize(headerfmt) fields = [ "record Data Offset", - "UniqueID", + "UniqueID" ] + + headerfmt = '>II' + headerlen = calcsize(headerfmt) + infolist = self.contents[self.offset : self.offset + headerlen] + # create tuple with info - results = zip(fields, unpack(headerfmt, self.contents[self.offset:self.offset+headerlen])) + results = dict(zip(fields, unpack(headerfmt, infolist))) # increment offset into file self.offset += headerlen - # convert tuple to dictionary - resultsDict = utils.toDict(results); + # futz around with the unique ID record, as the uniqueID's top 8 bits + # are really the "record attributes": + results['record Attributes'] = \ + (results['UniqueID'] & 0xFF000000) >> 24 - # futz around with the unique ID record, as the uniqueID's top 8 bytes are - # really the "record attributes": - resultsDict['record Attributes'] = (resultsDict['UniqueID'] & 0xFF000000) >> 24; - resultsDict['UniqueID'] = resultsDict['UniqueID'] & 0x00FFFFFF; + results['UniqueID'] = results['UniqueID'] & 0x00FFFFFF # store into the records dict - records[resultsDict['UniqueID']] = resultsDict; + records[results['UniqueID']] = results - return records; + return records def parseHeader(self): - headerfmt = '>32shhIIIIII4s4sIIH' - headerlen = calcsize(headerfmt) fields = [ "name", "attributes", @@ -111,34 +134,40 @@ def parseHeader(self): "number of records" ] + headerfmt = '>32shhIIIIII4s4sIIH' + headerlen = calcsize(headerfmt) + header = self.contents[self.offset : self.offset + headerlen] + # unpack header, zip up into list of tuples - results = zip(fields, unpack(headerfmt, 
self.contents[self.offset:self.offset+headerlen])) + results = dict(zip(fields, unpack(headerfmt, header))) # increment offset into file self.offset += headerlen - # convert tuple array to dictionary - resultsDict = utils.toDict(results); - - return resultsDict + return results - def readRecord0(self): - palmdocHeader = self.parsePalmDOCHeader(); - MobiHeader = self.parseMobiHeader(); + # this function will populate the self.config attribute + def populate_config(self): + palmdocHeader = self.parsePalmDOCHeader() + MobiHeader = self.parseMobiHeader() exthHeader = None - if MobiHeader['Has EXTH Header']: - exthHeader = self.parseEXTHHeader(); + if (MobiHeader['Has EXTH Header']): + exthHeader = self.parseEXTHHeader() - self.config = { + config = { 'palmdoc': palmdocHeader, 'mobi' : MobiHeader, 'exth' : exthHeader } + return config + def parseEXTHHeader(self): headerfmt = '>III' headerlen = calcsize(headerfmt) + header = self.contents[self.offset:self.offset + headerlen] + fields = [ 'identifier', 'header length', @@ -146,20 +175,24 @@ def parseEXTHHeader(self): ] # unpack header, zip up into list of tuples - results = zip(fields, unpack(headerfmt, self.contents[self.offset:self.offset+headerlen])) + results = dict(zip(fields, unpack(headerfmt, header))) + + self.offset += headerlen + + results['records'] = {} - # convert tuple array to dictionary - resultsDict = utils.toDict(results); + for record in range(results['record Count']): - self.offset += headerlen; - resultsDict['records'] = {}; - for record in range(resultsDict['record Count']): - recordType, recordLen = unpack(">II", self.contents[self.offset:self.offset+8]); - recordData = self.contents[self.offset+8:self.offset+recordLen]; - resultsDict['records'][recordType] = recordData; - self.offset += recordLen; + recordType, recordLen = \ + unpack(">II", self.contents[self.offset : self.offset + 8]) + + recordData = \ + self.contents[self.offset + 8 : self.offset+recordLen] - return resultsDict; + 
results['records'][recordType] = recordData + self.offset += recordLen + + return results def parseMobiHeader(self): headerfmt = '> IIII II 40s III IIIII IIII I 36s IIII 8s HHIIIII' @@ -211,34 +244,39 @@ def parseMobiHeader(self): "Unknown" ] - # unpack header, zip up into list of tuples - results = zip(fields, unpack(headerfmt, self.contents[self.offset:self.offset+headerlen])) + header = self.contents[self.offset:self.offset+headerlen] - # convert tuple array to dictionary - resultsDict = utils.toDict(results); + # unpack header, zip up into list of tuples + results = dict(zip(fields, unpack(headerfmt, header))) - resultsDict['Start Offset'] = self.offset; + results['Start Offset'] = self.offset - resultsDict['Full Name'] = (self.contents[ - self.records[0]['record Data Offset'] + resultsDict['Full Name Offset'] : - self.records[0]['record Data Offset'] + resultsDict['Full Name Offset'] + resultsDict['Full Name Length']]) + results['Full Name'] = (self.contents[ + self.records[0]['record Data Offset'] + results['Full Name Offset'] : + self.records[0]['record Data Offset'] + \ + results['Full Name Offset'] + results['Full Name Length']]) - resultsDict['Has DRM'] = resultsDict['DRM Offset'] != 0xFFFFFFFF; + results['Has DRM'] = results['DRM Offset'] != 0xFFFFFFFF - resultsDict['Has EXTH Header'] = (resultsDict['EXTH flags'] & 0x40) != 0; + results['Has EXTH Header'] = (results['EXTH flags'] & 0x40) != 0 - self.offset += resultsDict['header length']; + self.offset += results['header length'] def onebits(x, width=16): - return len(filter(lambda x: x == "1", (str((x>>i)&1) for i in xrange(width-1,-1,-1)))); + # Remove reliance on xrange()? 
+ return len(filter(lambda x: x == "1", + (str((x>>i)&1) for i in xrange(width - 1, -1, -1)))) - resultsDict['extra bytes'] = 2*onebits(unpack(">H", self.contents[self.offset-2:self.offset])[0] & 0xFFFE) + results['extra bytes'] = \ + 2 * onebits( + unpack(">H", self.contents[self.offset - 2 : self.offset])[0] & 0xFFFE) - return resultsDict; + return results def parsePalmDOCHeader(self): headerfmt = '>HHIHHHH' headerlen = calcsize(headerfmt) + fields = [ "Compression", "Unused", @@ -248,39 +286,11 @@ def parsePalmDOCHeader(self): "Encryption Type", "Unknown" ] - offset = self.records[0]['record Data Offset']; - # create tuple with info - results = zip(fields, unpack(headerfmt, self.contents[offset:offset+headerlen])) - - # convert tuple array to dictionary - resultsDict = utils.toDict(results); - - self.offset = offset+headerlen; - return resultsDict - -class MobiTests(unittest.TestCase): - def setUp(self): - self.mobitest = Mobi("../test/CharlesDarwin.mobi"); - def testParse(self): - self.mobitest.parse(); - pprint (self.mobitest.config) - def testRead(self): - self.mobitest.parse(); - content = "" - for i in range(1,5): - content += self.mobitest.readRecord(i); - def testImage(self): - self.mobitest.parse(); - pprint (self.mobitest.records); - for record in range(4): - f = open("imagerecord%d.jpg" % record, 'w') - f.write(self.mobitest.readImageRecord(record)); - f.close(); - def testAuthorTitle(self): - self.mobitest.parse() - self.assertEqual(self.mobitest.author(), 'Charles Darwin') - self.assertEqual(self.mobitest.title(), 'The Origin of Species by means '+ - 'of Natural Selection, 6th Edition') - -if __name__ == '__main__': - unittest.main() + + offset = self.records[0]['record Data Offset'] + + header = self.contents[offset:offset+headerlen] + results = dict(zip(fields, unpack(headerfmt, header))) + + self.offset = offset+headerlen + return results