initial checkin

This commit is contained in:
Satoru SATOH 2019-02-24 11:50:36 +09:00
parent a0fa804a86
commit 6621575d74
18 changed files with 22486 additions and 0 deletions

27
.travis.yml Normal file
View file

@ -0,0 +1,27 @@
# Ref. http://about.travis-ci.org/docs/user/languages/python/
language: python
python:
# Disabled for a while until fixes land in dependencies outside of smhtml, e.g. ordereddict, astroid.
# - 2.6
- 2.7
- 3.4
- 3.5
- 3.6
# .. seealso:: https://github.com/travis-ci/travis-ci/issues/9815
matrix:
include:
- python: 3.7
dist: xenial
sudo: true
install:
- pip install tox-travis
script:
- tox
after_success:
- coveralls
notifications:
email:
recipients:
- satoru.satoh+github@gmail.com
on_failure: always

8
MANIFEST.in Normal file
View file

@ -0,0 +1,8 @@
include LICENSE
include MANIFEST.in
include README.*
include setup.*
include tox.ini .gitignore .travis.yml
include pkg/*
recursive-include smhtml *.py
recursive-include tests *.py

35
README.rst Normal file
View file

@ -0,0 +1,35 @@
=============
smhtml
=============
About
======
.. .. image:: https://img.shields.io/pypi/v/smhtml.svg
:target: https://pypi.python.org/pypi/smhtml/
:alt: [Latest Version]
.. .. image:: https://img.shields.io/pypi/pyversions/smhtml.svg
:target: https://pypi.python.org/pypi/smhtml/
:alt: [Python versions]
.. image:: https://api.travis-ci.org/ssato/python-smhtml.png
:target: https://travis-ci.org/ssato/python-smhtml
:alt: [Test status]
.. image:: https://coveralls.io/repos/ssato/python-smhtml/badge.png
:target: https://coveralls.io/r/ssato/python-smhtml
:alt: [Coverage Status]
.. .. image:: https://landscape.io/github/ssato/python-smhtml/master/landscape.png
:target: https://landscape.io/github/ssato/python-smhtml/master
:alt: [Code Health]
This is a simple and experimental Python library to parse and dump MHTML [#]_ data.
- Author: Satoru SATOH <ssato@redhat.com>
- License: MIT
.. [#] https://ja.wikipedia.org/wiki/MHTML
.. vim:sw=2:ts=2:et:

10
pkg/nose.cfg Normal file
View file

@ -0,0 +1,10 @@
[nosetests]
verbosity=2
with-doctest=1
#all-modules=1
# Requires that nosetest processes multiple modules' test cases at once.
# processes=4
# coverage:
cover-package=smhtml
cover-branches=1

75
pkg/package.spec.in Normal file
View file

@ -0,0 +1,75 @@
%global pkgname smhtml
%if 0%{?fedora} || 0%{?rhel} > 7 || 0%{?epel} > 7
%global with_python3 1
%endif
%global desc \
A simple and experimental python library to parse and dump MHTML data.
Name: python-%{pkgname}
Version: @VERSION@
Release: 1%{?dist}
Summary: Python library to parse and dump MHTML data
Group: Development/Tools
License: MIT
URL: https://github.com/ssato/python-smhtml
Source0: %{url}/archive/RELEASE_%{version}.tar.gz
BuildArch: noarch
%if 0%{?with_python3}
BuildRequires: python3-devel
BuildRequires: python3-setuptools
%else
BuildRequires: python2-setuptools
BuildRequires: python2-devel
%endif
%description %{desc}
%if 0%{?with_python3}
%package -n python3-%{pkgname}
Summary: %{summary}
Requires: python3-chardet
%{?python_provide:%python_provide python3-%{pkgname}}
%description -n python3-%{pkgname} %{desc}
%else
%package -n python2-%{pkgname}
Summary: %{summary}
Requires: python2-chardet
%{?python_provide:%python_provide python2-%{pkgname}}
%description -n python2-%{pkgname} %{desc}
%endif
%prep
%autosetup -n %{pkgname}-%{version}
%build
%if 0%{?with_python3}
%py3_build
%else
%py2_build
%endif
%install
%if 0%{?with_python3}
%py3_install
%else
%py2_install
%endif
%if 0%{?with_python3}
%files -n python3-%{pkgname}
%{python3_sitelib}/%{pkgname}*
%else
%files -n python2-%{pkgname}
%{python_sitelib}/%{pkgname}*
%endif
%doc README.rst examples
%{_bindir}/*
%changelog
* Sun Feb 24 2019 Satoru SATOH <ssato@redhat.com> - 0.0.1-1
- Initial packaging

1
pkg/requirements.txt Normal file
View file

@ -0,0 +1 @@
chardet

View file

@ -0,0 +1,6 @@
-r requirements.txt
coveralls
flake8 < 3.5.0
nose
pycodestyle < 2.4.0
pylint

50
setup.cfg Normal file
View file

@ -0,0 +1,50 @@
# ref. https://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files
[bdist_wheel]
universal = 1
[metadata]
name = smhtml
# .. todo::
# version = attr: src.VERSION
description = A module to parse and dump MHTML data
long_description = file: README.rst
author = Satoru SATOH
author_email = satoru.satoh@gmail.com
maintainer = Satoru SATOH
maintainer_email = satoru.satoh@gmail.com
url = https://github.com/ssato/python-smhtml
license = MIT
classifiers =
Development Status :: 4 - Beta
License :: OSI Approved :: MIT License
Intended Audience :: Developers
Programming Language :: Python
Programming Language :: Python :: 2
Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
Programming Language :: Python :: 3.3
Programming Language :: Python :: 3.4
Programming Language :: Python :: 3.5
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Environment :: Console
Operating System :: OS Independent
Topic :: Software Development :: Libraries :: Python Modules
Topic :: Text Processing :: Markup
Topic :: Utilities
[options]
include_package_data = True
# .. note:: It does not seem to work as expected with older setuptools.
#package_dir =
# = src
packages = find:
[options.packages.find]
exclude =
tests
tests.*
#[options.entry_points]
#console_scripts =
# smhtml_cli = smhtml.cli:main

46
setup.py Normal file
View file

@ -0,0 +1,46 @@
from __future__ import absolute_import
import os.path
import os
import re
import sys
import setuptools
import setuptools.command.bdist_rpm
# Extract the version string from the ``VERSION = "..."`` line found in
# smhtml/globals.py (the path is relative to the current working directory).
_VERSION_RE = re.compile(r'^VERSION = "([^"]+)"')

# Use a context manager so that the file handle is closed deterministically.
with open("smhtml/globals.py") as _globals_file:
    _version_matches = [_VERSION_RE.search(_line) for _line in _globals_file]

# Lines that merely mention VERSION without matching the pattern are skipped
# instead of failing with AttributeError. IndexError is still raised if no
# line defines VERSION in the expected form.
VERSION = [_m.group(1) for _m in _version_matches if _m][0]

# For daily snapshot versioning mode: append a ".YYYYMMDD" suffix of today if
# the environment variable _SNAPSHOT_BUILD is set (to any value).
if os.environ.get("_SNAPSHOT_BUILD", None) is not None:
    import datetime
    VERSION = VERSION + datetime.datetime.now().strftime(".%Y%m%d")
class bdist_rpm(setuptools.command.bdist_rpm.bdist_rpm):
    """Override the default content of the RPM SPEC.

    The content of the RPM SPEC is made from the template file
    pkg/package.spec.in with some strings in it replaced.
    """
    # Absolute path of the RPM SPEC template, resolved against the current
    # directory at the time this class is defined.
    spec_tmpl = os.path.join(os.path.abspath(os.curdir),
                             "pkg/package.spec.in")

    def _replace(self, line):
        """Replace some strings in the RPM SPEC template

        :param line: A line of the RPM SPEC template
        :return: The line to put in the RPM SPEC
        """
        if "@VERSION@" in line:
            return line.replace("@VERSION@", VERSION)

        if "Source0:" in line:  # Dirty hack
            return "Source0: %{pkgname}-%{version}.tar.gz"

        return line

    def _make_spec_file(self):
        """
        :return: A list of lines of the RPM SPEC made from the template
        """
        # Close the template file explicitly instead of leaking its handle.
        with open(self.spec_tmpl) as tmpl:
            return [self._replace(line.rstrip()) for line in tmpl]
# Run the setup with the customized bdist_rpm command registered.
setuptools.setup(
    name="smhtml",
    version=VERSION,
    cmdclass={"bdist_rpm": bdist_rpm},
)
# vim:sw=4:ts=4:et:

25
smhtml/__init__.py Normal file
View file

@ -0,0 +1,25 @@
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
r"""
.. module:: smhtml
:platform: Unix, Windows
:synopsis: Simple MHTML parsing library
python-smhtml is a simple and experimental MHTML parsing library for python.
- Home: https://github.com/ssato/python-smhtml
"""
from .api import (
AUTHOR, VERSION,
parse, parse_itr
)
# Expose the package metadata imported from .api under the dunder names.
__author__ = AUTHOR
__version__ = VERSION
# Names exported by ``from smhtml import *``.
__all__ = ["parse", "parse_itr"]
# vim:sw=4:ts=4:et:

23
smhtml/api.py Normal file
View file

@ -0,0 +1,23 @@
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
# pylint: disable=unused-import
r"""Public APIs of smhtml module.
.. versionadded:: 0.0.1
"""
from __future__ import absolute_import
from .globals import (
PACKAGE, AUTHOR, VERSION, LOGGER # flake8: noqa
)
from .parser import parse, parse_itr # flake8: noqa
def version():
    """Split the version string of this package by dots.

    :return:
        A list of strings of version components, [major, minor, release],
        e.g. ['0', '8', '2'] if the version is "0.8.2"
    """
    components = VERSION.split(".")
    return components
# vim:sw=4:ts=4:et:

16
smhtml/globals.py Normal file
View file

@ -0,0 +1,16 @@
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
"""Some globals of smhtml module.
"""
import logging
# Name of this package; also used as the name of the logger below.
PACKAGE = "smhtml"
AUTHOR = "Satoru SATOH <satoru.satoh@gmail.com>"
# NOTE: setup.py extracts the version by matching this line against the
# pattern ^VERSION = "([^"]+)" so keep the assignment in exactly this form.
VERSION = "0.0.1"
# Logger named after PACKAGE.
LOGGER = logging.getLogger(PACKAGE)
# vim:sw=4:ts=4:et:

112
smhtml/parser.py Normal file
View file

@ -0,0 +1,112 @@
#
# -*- coding: utf-8; mode: python -*-
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
# pylint: disable=unused-import
r"""Simple and very experimental MHTML Parser
.. versionadded:: 0.0.1
"""
from __future__ import absolute_import
import base64
import email
import mimetypes
import quopri
import chardet
def detect_charset(bmsg, default="ascii"):
    r"""Guess the charset of given byte data with chardet.

    :param bmsg: A byte data to detect charset
    :param default:
        A charset name to return if `bmsg` is empty or chardet could not
        determine the charset of `bmsg`
    :return: A string represents charset such as 'utf-8', 'iso-2022-jp'

    >>> detect_charset(b"a")
    'ascii'
    >>> detect_charset(b"")
    'ascii'
    >>> detect_charset(u"\u3042\u3044\u3046".encode("utf-8"))
    'utf-8'
    """
    if not bmsg:
        return default

    # chardet gives None as the encoding if it could not make any guess; do
    # not pass that on to callers which use the result to decode data.
    return chardet.detect(bmsg)["encoding"] or default
def decode_part(part):
    """Decode the payload of a MIME part and collect some info of it.

    :param part: :class:`email.message.Message` object (MIME part)
    :return:
        A dict with the following items:

        - type: Content type of the part, e.g. 'text/html'
        - encoding: Charset of the text data or None for non-text parts
        - data: Decoded payload; a text for 'text/*' parts, otherwise the
          payload with base64 or quoted-printable encoding reverted
        - location: A list of Content-Location header values or None if the
          part does not have that header
    """
    # This header is optional and its value is case-insensitive.
    ctes = part.get_all("Content-Transfer-Encoding") or [""]
    cte = str(ctes[0]).strip().lower()
    payload = part.get_payload()

    if cte == "base64":
        bdata = base64.b64decode(payload)
    elif cte == "quoted-printable":
        bdata = quopri.decodestring(payload)
    else:  # Other encodings or no header: leave the payload as it is.
        bdata = payload

    if part.get_content_maintype() != "text":
        charset = None
        data = bdata
    elif isinstance(bdata, bytes):
        # Fall back to the declared charset if the detection gives nothing.
        charset = (detect_charset(bdata) or part.get_content_charset() or
                   "ascii")
        data = bdata.decode(charset, "ignore")
    else:
        # The payload is a text already (not encoded part in python 3); it
        # has no decode() and there are no bytes to detect the charset from.
        charset = part.get_content_charset()
        data = bdata

    return dict(type=part.get_content_type(), encoding=charset, data=data,
                location=part.get_all("Content-Location"))
def get_or_gen_filename(part, idx=0):
    """Get the filename of a MIME part or generate one if it has none.

    :param part: :class:`email.message.Message` object (MIME part)
    :param idx: An index number used to generate the filename
    :return:
        The filename `part` has, or "part-<idx in 3 digits><ext>" where ext
        is guessed from its content type and falls back to ".bin"
    """
    name = part.get_filename()
    if name:
        return name

    suffix = mimetypes.guess_extension(part.get_content_type()) or ".bin"
    return "part-%03d%s" % (idx, suffix)
def parse_itr(filepath):
    """Parse a MHTML file and yield info of each non-multipart part of it.

    :param filepath: Path of the MHTML file, a str or a path-like object
    :return: A generator yields each part parsed from `filepath` opened
    :raises ValueError:
        if the data in `filepath` is not multi-part MIME data; it is raised
        when the first item is requested because this is a generator
    """
    if hasattr(email, "message_from_binary_file"):
        # Parse from bytes so that the result does not depend on the
        # encoding of the locale and 8-bit data do not break reading.
        with open(filepath, "rb") as fobj:
            mhtml_data = email.message_from_binary_file(fobj)
    else:  # python 2 does not have the binary variant.
        with open(filepath) as fobj:
            mhtml_data = email.message_from_file(fobj)

    if not mhtml_data.is_multipart():
        raise ValueError("Multi-part MIME data was not found in "
                         "'%s'" % filepath)

    for idx, part in enumerate(mhtml_data.walk()):
        # Multipart containers are skipped; only the parts in them are
        # decoded.
        if part.get_content_maintype() == "multipart":
            continue

        filename = get_or_gen_filename(part, idx=idx)
        info = decode_part(part)
        info["index"] = idx
        info["filename"] = filename
        yield info
def parse(filepath):
    """Parse a MHTML file and collect info of all parts at once.

    :param filepath: Path of the MHTML file, passed to :func:`parse_itr`
    :return: A list of parsed data, that is, items :func:`parse_itr` yields
    """
    parts = parse_itr(filepath)
    return list(parts)
# vim:sw=4:ts=4:et:

0
tests/__init__.py Normal file
View file

46
tests/common.py Normal file
View file

@ -0,0 +1,46 @@
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh at gmail.com>
# License: MIT
#
# pylint: disable=missing-docstring
from __future__ import absolute_import
import difflib
import os.path
import os
import tempfile
def selfdir():
    """
    :return: Path of the dir in which this module file is, from __file__
    """
    parent, _basename = os.path.split(__file__)
    return parent
def setup_workdir():
    """
    Create a new working dir in the default dir for temporary files.

    :return: Path of the created working dir
    """
    # Let tempfile choose the base dir instead of the hard-coded "/tmp", which
    # is not available on every platform and ignores settings such as TMPDIR.
    return tempfile.mkdtemp(prefix="python-tests-")
def cleanup_workdir(workdir):
    """
    Remove the working dir `workdir` and everything in it.

    Nothing happens if `workdir` does not exist, and failures to remove are
    ignored as the forced removal by the rm command does.

    :param workdir: Path of the working dir to remove
    """
    import shutil  # Local import as it is not imported at the module level.

    # Never pass the path to a shell like "rm -rf " + workdir; white spaces or
    # shell meta characters in the path may remove something else.
    if os.path.isdir(workdir) and not os.path.islink(workdir):
        shutil.rmtree(workdir, ignore_errors=True)
    else:
        try:
            os.remove(workdir)
        except OSError:
            pass  # Missing or unremovable path is not an error, like rm -f.
def diff(result, exp):
    """
    Make a text of the unified diff between `result` and `exp`.

    :param result: Result string
    :param exp: Expected result string
    :return:
        A string starts with a newline followed by the unified diff, from
        'Result' to 'Expected', enclosed in single quotes
    """
    # The lines from splitlines() have no line ends; lineterm='' keeps the
    # control lines of the diff free of them too so that joining with "\n"
    # does not leave blank lines after these.
    diff_ = difflib.unified_diff(result.splitlines(), exp.splitlines(),
                                 'Result', 'Expected', lineterm='')
    return "\n'" + "\n".join(diff_) + "'"
# vim:sw=4:ts=4:et:

23
tests/parser.py Normal file
View file

@ -0,0 +1,23 @@
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
# pylint: disable=missing-docstring, invalid-name
from __future__ import absolute_import, with_statement
import os.path
import os
import unittest
import smhtml.parser as TT
import tests.common as TC
class Test(unittest.TestCase):
    # Smoke test of smhtml.parser.parse with the MHTML file in tests/res/.

    def test_20_parse(self):
        # Only checks that parsing the file gives a non-empty result.
        mhtml_path = os.path.join(TC.selfdir(), "res/python-smhtml.mhtml")
        self.assertTrue(TT.parse(mhtml_path))
# vim:sw=4:ts=4:et:

21969
tests/res/python-smhtml.mhtml Normal file

File diff suppressed because it is too large Load diff

14
tox.ini Normal file
View file

@ -0,0 +1,14 @@
[tox]
envlist = py27, py34, py35, py36, py37
[flake8]
exclude = .git,.tox,dist,*egg,setup.py
[testenv]
deps = -r{toxinidir}/pkg/test_requirements.txt
commands =
flake8 --doctests smhtml tests
- pylint --disable=invalid-name,locally-disabled smhtml
python -m nose -v --with-doctest --all-modules --where tests --with-coverage --cover-tests
setenv =
PYTHONPATH = {toxinidir}/