分割iPod文本的Python程序

分割iPod文本的Python程序

2009-05-26 01:07 AM　2394次查看

分类：Python 标签：Python

用过iPod的都知道，超过4kb的文本就直接截断了，这直接影响能否阅读长篇小说。

于是初学Python时，我写了个iPod文本分割程序，不过为了效率和兼容性，写了大量没什么用的代码，所以一直没放出来。
最近又准备读小说了，于是将这个程序完善了一下。
精简了一些代码，其实难点就是找出最后一个字符，不能把中文分开了。如果输出不是UTF-8这种可变长度的编码的话，我想我会安逸很多。

基本功能如下：

- 可以将文本分割为不超过 4 KB 的小文件，按编号存放在与原文件同名的文件夹中。
- 支持指定输入编码，默认为系统默认编码；此外专门对UTF-8编码进行了处理，其余编码可能会有些许问题。
- 输出编码为UTF-8。原因是如果指定GBK，则超出GB2312编码范围的字符（例如大多数的繁体字）都不能在iPod上显示。
- 文本开头有返回目录的链接，结尾有进入下一页的链接。如果要返回上一页，则按MENU键（向上）；如果要返回主菜单，则长按MENU键。

代码：

# -*- coding: utf8 -*-

u"""
  本程序用于分割文本文件，以方便iPod读取。

  使用方法：

  cut_ipod_file.py [路径]文件名 [编码]

  若路径和文件名中包含空格，请用双引号(")引起来。

  编码留空则采用系统默认编码。

  为避免iPod显示为乱码，输出格式为UTF-8。
"""

from __future__ import with_statement
import sys
import locale

__author__ = 'keakon'

# Encoding assumed for the input file when the caller does not name one.
# getdefaultlocale() reports None as the encoding in the C/POSIX locale, so
# fall back to the preferred encoding to keep the default usable.
DEFAULT_ENCODING = locale.getdefaultlocale()[1] or locale.getpreferredencoding()
# Pages are always written as UTF-8.
OUTPUT_ENCODING = 'utf8'
# Upper bound, in bytes, for one output page (header and footer included).
CUT_SIZE = 4096


def _char_boundary(data, pos):
  """Return the largest index <= pos at which UTF-8 `data` can be cut."""
  if pos >= len(data):
    return len(data)
  # UTF-8 continuation bytes have the form 0b10xxxxxx; step back until the
  # byte at `pos` starts a character, so no character is split in two.
  # Slicing (not indexing) yields a 1-byte string on both Python 2 and 3.
  while pos > 0 and (ord(data[pos:pos + 1]) & 0xC0) == 0x80:
    pos -= 1
  return pos


def _page_name(page, digits):
  """Return the zero-padded page number used for file names and links."""
  return str(page).zfill(digits)


def _footer(page, digits, newline):
  """Return the encoded "next page" link that points at page `page`."""
  name = _page_name(page, digits)
  link = u'<a href="%s.txt">第%s页</a>' % (name, name)
  return newline + link.encode(OUTPUT_ENCODING)


def _split_pages(content, header, newline):
  """Split UTF-8 `content` into page-sized chunks; return (chunks, digits).

  `digits` is the zero-padding width of the page numbers.  The footer length
  depends on that width and the width depends on the page count, so the
  split is retried with a wider number until the two agree.
  """
  digits = 1
  while True:
    room = CUT_SIZE - len(header) - len(_footer(0, digits, newline))
    if room < 4:
      # A UTF-8 character is at most 4 bytes; less room cannot make progress.
      raise ValueError('CUT_SIZE leaves no room for text')

    chunks = []
    start = 0
    while start < len(content):
      end = _char_boundary(content, start + room)
      chunks.append(content[start:end])
      start = end

    # The highest number that appears anywhere is the last page's index.
    if len(str(max(len(chunks) - 1, 0))) <= digits:
      return chunks, digits
    digits += 1


def cut(filename, encoding=DEFAULT_ENCODING):
  """Split a text file into UTF-8 pages of at most CUT_SIZE bytes for an iPod.

  The pages are written as 0.txt, 1.txt, ... (zero-padded to a common width)
  into a directory named after the input path with its extension removed.
  Every page starts with a UTF-8 BOM and a "back to index" link; every page
  except the last ends with a link to the following page.

  Args:
    filename: path of the text file to split.
    encoding: encoding of the input file; DEFAULT_ENCODING when omitted or
      None.

  Returns:
    None on success, 1 if no file name was given, 2 if the file cannot be
    stat'ed, 3 if the file is no larger than CUT_SIZE bytes.

  Raises:
    UnicodeDecodeError: the file is not valid in `encoding`.  Decoding is
      deliberately strict so that a wrong encoding is noticed.
    LookupError: `encoding` is not a known codec.
    OSError: the output directory cannot be created.
  """

  import os
  import codecs

  # Validate the arguments

  if not filename:
    print(u'未指定文件名。')
    return 1

  try:
    filesize = os.path.getsize(filename)
  except OSError:
    print(u'文件不存在。')
    return 2

  if filesize <= CUT_SIZE:
    print(u'文件太小，无需分割。')
    return 3

  # Create the output directory (input path without its extension)

  dirname = os.path.splitext(filename)[0]
  if not os.path.isdir(dirname):
    os.mkdir(dirname)

  # Read the file and re-encode it as UTF-8

  with open(filename, 'rb') as fin:
    raw = fin.read()
  content = raw.decode(encoding or DEFAULT_ENCODING).encode(OUTPUT_ENCODING)
  if content.startswith(codecs.BOM_UTF8):
    # Drop the source BOM; each page gets its own from the header.
    content = content[len(codecs.BOM_UTF8):]

  # Page furniture, built as bytes so sizes are counted in output bytes

  newline = os.linesep.encode('ascii')
  header = (codecs.BOM_UTF8 +
            u'<a href=".">返回目录</a>'.encode(OUTPUT_ENCODING) +
            newline)

  # Write the pages

  chunks, digits = _split_pages(content, header, newline)
  last_page = len(chunks) - 1

  for page, chunk in enumerate(chunks):
    out_filename = os.path.join(dirname, _page_name(page, digits) + '.txt')
    with open(out_filename, 'wb') as fout:
      fout.write(header)
      fout.write(chunk)
      if page != last_page:
        fout.write(_footer(page + 1, digits, newline))

  print(u'分割完毕。')


if __name__ == '__main__':
  # Command line: cut_ipod_file.py [path]filename [encoding]
  # With no arguments the module docstring is printed as the usage text.
  # The process exit status is cut()'s return value (None maps to 0).
  # sys.exit is used instead of the bare `exit`, which is injected by the
  # `site` module for interactive use and is absent under `python -S`.

  args = sys.argv[1:]

  if not args:
    print(__doc__)
    sys.exit(0)

  # Only the file name and the optional encoding are used; extra arguments
  # are ignored.
  sys.exit(cut(*args[:2]))

1条评论你不来一发么↓ 顺序排列倒序排列

向下滚动可载入更多评论，或者点这里禁止自动加载。

想说点什么呢？