start rework

This commit is contained in:
Izalia Mae 2021-07-23 03:00:36 -04:00
parent 56271cee9a
commit 0358c971aa
23 changed files with 172 additions and 507 deletions

View file

@ -1,6 +1,6 @@
MIT License
Copyright (c) 2019 Satoru SATOH
Copyright (c) 2021 Zoey Mae
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View file

@ -1,9 +0,0 @@
include LICENSE
include MANIFEST.in
include README.*
include setup.*
include tox.ini .gitignore .travis.yml
include pkg/*
include examples/*
recursive-include smhtml *.py
recursive-include tests *.py

7
README.md Normal file
View file

@ -0,0 +1,7 @@
# PyMHTML
A simple library to parse MHTML files and compile them into a single HTML string. It can export the string as an HTML file, or export the original HTML file along with all of its supporting files.
Forked from [SMHTML](https://github.com/ssato/python-smhtml)
Note: work in progress

View file

@ -1,44 +0,0 @@
=================
python-smhtml
=================
About
======
.. .. image:: https://img.shields.io/pypi/v/smhtml.svg
:target: https://pypi.python.org/pypi/smhtml/
:alt: [Latest Version]
.. .. image:: https://img.shields.io/pypi/pyversions/smhtml.svg
:target: https://pypi.python.org/pypi/smhtml/
:alt: [Python versions]
.. image:: https://api.travis-ci.org/ssato/python-smhtml.png
:target: https://travis-ci.org/ssato/python-smhtml
:alt: [Test status]
.. image:: https://coveralls.io/repos/ssato/python-smhtml/badge.png
:target: https://coveralls.io/r/ssato/python-smhtml
:alt: [Coverage Status]
.. .. image:: https://landscape.io/github/ssato/python-smhtml/master/landscape.png
:target: https://landscape.io/github/ssato/python-smhtml/master
:alt: [Code Health]
This is a simple and experimental python library to load MHTML files and
extract files from them, and dump (make) MHTML [#]_ data from files.
- Author: Satoru SATOH <satoru.satoh@gmail.com>
- License: MIT
.. [#] https://en.wikipedia.org/wiki/MHTML
Misc
======
Here is a demo screenshot to show its CLI frontend can extract files from MHTML
data and make (dump) MHTML data from files.
.. image:: examples/smhtml_cli-screenshot-0.png
.. vim:sw=2:ts=2:et:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 208 KiB

View file

@ -1,10 +0,0 @@
[nosetests]
verbosity=2
with-doctest=1
#all-modules=1
# Requires that nosetest processes multiple modules' test cases at once.
# processes=4
# coverage:
cover-package=smhtml
cover-branches=1

View file

@ -1,93 +0,0 @@
%global pkgname smhtml
%if 0%{?fedora} || 0%{?rhel} > 7 || 0%{?epel} > 7
%global with_python3 1
%endif
%global desc \
A simple and experimental python library to parse and dump MHTML data.
Name: python-%{pkgname}
Version: @VERSION@
Release: 1%{?dist}
Summary: Python library to parse and dump MHTML data
Group: Development/Tools
License: MIT
URL: https://github.com/ssato/python-smhtml
Source0: %{url}/archive/RELEASE_%{version}.tar.gz
BuildArch: noarch
%if 0%{?with_python3}
BuildRequires: python3-devel
BuildRequires: python3-setuptools
%else
BuildRequires: python2-setuptools
BuildRequires: python2-devel
%endif
%description %{desc}
%if 0%{?with_python3}
%package -n python3-%{pkgname}
Summary: %{summary}
Requires: python3-chardet
%{?python_provide:%python_provide python3-%{pkgname}}
%description -n python3-%{pkgname} %{desc}
%else
%package -n python2-%{pkgname}
Summary: %{summary}
Requires: python2-chardet
%{?python_provide:%python_provide python2-%{pkgname}}
%description -n python2-%{pkgname} %{desc}
%endif
%prep
%autosetup -n %{pkgname}-%{version}
%build
%if 0%{?with_python3}
%py3_build
%else
%py2_build
# Dirty hacks.
test -d %{buildroot}%{python_sitelib}/%{pkgname} || {
cp -a src/%{pkgname} %{buildroot}%{python_sitelib}/
}
test -d %{buildroot}/usr/bin || {
install -d %{buildroot}/usr/bin
cat << EOF > %{buildroot}/usr/bin/smhtml_cli
#! /usr/bin/python
import sys
from pkg_resources import load_entry_point
if __name__ == '__main__':
sys.exit(
load_entry_point('smhtml', 'console_scripts', 'smhtml_cli')()
)
EOF
}
chmod +x %{buildroot}/usr/bin/smhtml_cli
%endif
%install
%if 0%{?with_python3}
%py3_install
%else
%py2_install
%endif
%if 0%{?with_python3}
%files -n python3-%{pkgname}
%{python3_sitelib}/%{pkgname}*
%else
%files -n python2-%{pkgname}
%{python_sitelib}/%{pkgname}*
%endif
%doc README.rst
%{_bindir}/*
%changelog
* Mon Feb 25 2019 Satoru SATOH <ssato@redhat.com> - 0.0.1-1
- Initial packaging

View file

@ -1 +0,0 @@
chardet

View file

@ -1,6 +0,0 @@
-r requirements.txt
coveralls
flake8 < 3.5.0
nose
pycodestyle < 2.4.0
pylint

14
pymhtml/__init__.py Normal file
View file

@ -0,0 +1,14 @@
# Package metadata.
PACKAGE = 'PyMHTML'
AUTHOR = 'Zoey Mae'
LICENSE = 'MIT'
REPO = 'https://git.barkshark.xyz/izaliamae/pymhtml'

# The version is kept as a tuple; the dotted string form is derived from it.
VERSION = (0, 1, 0)
__version__ = '.'.join(map(str, VERSION))
from .misc import MhtmlObject, DotDict
from .parser import MhtmlParser
def parse(filename):
    """Parse the MHTML file at *filename*.

    :param filename: path of the MHTML file, handed unchanged to MhtmlParser
    :return: the MhtmlParser instance built from that file
    """
    parser = MhtmlParser(filename)
    return parser

25
pymhtml/__main__.py Normal file
View file

@ -0,0 +1,25 @@
import sys
from pathlib import Path
from .parser import MhtmlParser
# Command line entry point: parse the MHTML file given as the first argument
# and print a summary of everything found in it.
try:
    raw_path = Path(sys.argv[1])

except IndexError:
    raise IndexError('Forgot to specify an MHTML file to parse') from None

# Paths starting with "~" are expanded to the home directory; anything else
# is resolved to an absolute path.
path = raw_path.expanduser() if str(raw_path).startswith('~') else raw_path.resolve()

test = MhtmlParser(path)

print('MHTML Object:', test)
print('HTML File:', test.html)

for part in test.files.values():
    print('MHTML Part:', part)

40
pymhtml/misc.py Normal file
View file

@ -0,0 +1,40 @@
import chardet, mimetypes
from pathlib import Path
class DotDict(dict):
    """A dict whose items can also be read, set and deleted as attributes.

    Attribute access maps directly onto the item methods of ``dict``, so
    reading or deleting a missing attribute raises ``KeyError``.
    """

    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
    # Must map to __delitem__: mapping it to __getitem__ made ``del obj.key``
    # a silent no-op that left the key in place.
    __delattr__ = dict.__delitem__

    def copy(self):
        """Return a shallow copy that is still a DotDict."""
        return DotDict(self)
class MhtmlObject(DotDict):
    """A single (non-multipart) part of an MHTML document.

    Keys set on construction:

    - ``body``: decoded payload; text for ``text/*`` parts, bytes otherwise
    - ``mimetype``: full content type of the part
    - ``base_type``: main content type of the part
    - ``location``: first ``Content-Location`` header value, or ``None``
      when the part has no such header
    - ``filename``: name from the part headers, else derived from the
      location, else generated from the part index
    - ``encoding``: charset detected by chardet for text parts, else ``None``
    """

    def __init__(self, idx, part):
        """
        :param idx: index of the part, used to generate a fallback filename
        :param part: the message part to wrap
        """
        super().__init__()

        # get_all() returns None instead of a list when the header is missing
        locations = part.get_all('Content-Location')

        self.body = part.get_payload(decode=True)
        self.mimetype = part.get_content_type()
        self.base_type = part.get_content_maintype()
        self.location = locations[0] if locations else None
        self.filename = part.get_filename()
        self.encoding = None

        if not self.filename:
            if self.location and not self.location.startswith('data:'):
                # Last path segment of the location without its query string
                self.filename = Path(self.location).name.split('?', 1)[0]

            # No usable location (missing, a data: URI, or a last segment
            # that is only a query string): generate a name from the index.
            if not self.filename:
                ext = mimetypes.guess_extension(self.mimetype) or '.bin'
                self.filename = f'part-{idx}{ext}'

        if self.base_type == 'text':
            self.encoding = chardet.detect(self.body)['encoding']
            # chardet reports None when it cannot detect an encoding
            self.body = self.body.decode(self.encoding or 'ascii')

    def __str__(self):
        return f'{self.__class__.__name__}(name="{self.filename}", location="{self.location}", mimetype="{self.mimetype}")'

54
pymhtml/parser.py Normal file
View file

@ -0,0 +1,54 @@
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
# pylint: disable=unused-import
r"""Load and parse MHTML data.
.. versionadded:: 0.0.1
"""
import email
from pathlib import Path
from .misc import MhtmlObject
class MhtmlParser:
    """Parse an MHTML file into its main HTML part and its other parts.

    Attributes set on construction:

    - ``filepath``: resolved path of the parsed file
    - ``html``: the first ``text/html`` part, or ``None`` if there is none
    - ``files``: every other part, keyed by its filename
    """

    def __init__(self, filepath):
        """
        :param filepath: path of the MHTML file to parse
        :raises TypeError: if the file does not hold multipart MIME data
        """
        self.filepath = Path(filepath).resolve()
        self.html = None
        self.files = {}

        self.parse_file()

    def __str__(self):
        # self.html stays None when the file holds no text/html part
        page = self.html.location if self.html is not None else None
        return f'{self.__class__.__name__}(file="{self.filepath}", page="{page}")'

    def parse_file(self):
        """Read ``self.filepath`` and fill ``self.html`` and ``self.files``.

        :raises TypeError: if the file does not hold multipart MIME data
        """
        # Context manager so the file handle is closed once it has been read
        with self.filepath.open('r') as fd:
            mdata = email.message_from_file(fd)

        if not mdata.is_multipart():
            raise TypeError('Not an MHTML file or missing MIME data')

        for idx, part in enumerate(mdata.walk()):
            # Multipart containers only group other parts; skip them
            if part.get_content_maintype() == "multipart":
                continue

            data = MhtmlObject(idx, part)

            # The first text/html part is the page itself; later ones are
            # stored with the other files.
            if data.mimetype == 'text/html' and not self.html:
                self.html = data

            else:
                self.files[data.filename] = data

    def build_page(self):
        """Not implemented yet."""
        pass

1
requirements.txt Normal file
View file

@ -0,0 +1 @@
chardet>=4.0.0

73
setup.py Normal file → Executable file
View file

@ -1,46 +1,33 @@
from __future__ import absolute_import
import os.path
import os
import re
import sys
import setuptools
import setuptools.command.bdist_rpm
#!/usr/bin/env python3
from setuptools import setup, find_packages
# It might throw IndexError and so on.
VERSION = [re.search(r'^VERSION = "([^"]+)"', l).groups()[0] for l
in open("smhtml/globals.py").readlines()
if "VERSION" in l][0]
setup(
name='PyMHTML',
version='0.1.0',
packages=find_packages(),
python_requires='>=3.3.0',
include_package_data=False,
author='Zoey Mae',
author_email='admin@barkshark.xyz',
description='Simple MHTML parser and exporter',
keywords='web html mhtml',
url='https://git.barkshark.xyz/izaliamae/pymhtml',
project_urls={
'Bug Tracker': 'https://git.barkshark.xyz/izaliamae/pymhtml/issues',
'Documentation': 'https://git.barkshark.xyz/izaliamae/pymhtml/wiki',
'Source Code': 'https://git.barkshark.xyz/izaliamae/pymhtml'
},
# For daily snapshot versioning mode:
if os.environ.get("_SNAPSHOT_BUILD", None) is not None:
import datetime
VERSION = VERSION + datetime.datetime.now().strftime(".%Y%m%d")
class bdist_rpm(setuptools.command.bdist_rpm.bdist_rpm):
"""Override the default content of the RPM SPEC.
"""
spec_tmpl = os.path.join(os.path.abspath(os.curdir),
"pkg/package.spec.in")
def _replace(self, line):
"""Replace some strings in the RPM SPEC template"""
if "@VERSION@" in line:
return line.replace("@VERSION@", VERSION)
if "Source0:" in line: # Dirty hack
return "Source0: %{pkgname}-%{version}.tar.gz"
return line
def _make_spec_file(self):
return [self._replace(l.rstrip()) for l
in open(self.spec_tmpl).readlines()]
setuptools.setup(name="smhtml", version=VERSION,
cmdclass=dict(bdist_rpm=bdist_rpm))
# vim:sw=4:ts=4:et:
classifiers=[
'License :: OSI Approved :: MIT License',
'Development Status :: 3 - Alpha',
'Programming Language :: Python :: 3.3',
'Operating System :: POSIX',
'Operating System :: MacOS :: MacOS X',
'Operating System :: Microsoft :: Windows',
'Topic :: Internet :: WWW/HTTP',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules'
]
)

View file

@ -1,27 +0,0 @@
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
r"""
.. module:: smhtml
:platform: Unix, Windows
:synopsis: Simple python library to load, extract and dump MHTML data
python-smhtml is a simple and experimental python library to load MHTML files and
extract files from them, and dump (make) MHTML data from files.
- Home: https://github.com/ssato/python-smhtml
About MHTML format, please refer other web pages such like
https://en.wikipedia.org/wiki/MHTML.
"""
from .api import (
AUTHOR, VERSION, load, loads, dump, dumps, extract # flake8: noqa
)
__author__ = AUTHOR
__version__ = VERSION
# Public API: the loader and dumper functions imported from .api above.
# "loads" was missing here and "load" was listed twice.
__all__ = ["load", "loads", "extract", "dump", "dumps"]
# vim:sw=4:ts=4:et:

View file

@ -1,24 +0,0 @@
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
# pylint: disable=unused-import
r"""Public APIs of smhtml module.
.. versionadded:: 0.0.1
"""
from __future__ import absolute_import
from .globals import (
PACKAGE, AUTHOR, VERSION, LOGGER # flake8: noqa
)
from .loader import load, loads, extract # flake8: noqa
from .dumper import dump, dumps # flake8: noqa
def version():
    """Split the package version string into its components.

    :return: A list of the dot-separated parts of VERSION, as strings
    """
    components = VERSION.split('.')
    return components
# vim:sw=4:ts=4:et:

View file

@ -1,16 +0,0 @@
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
"""Some globals of smhtml module.
"""
import logging
# Package metadata.
AUTHOR = "Satoru SATOH <satoru.satoh@gmail.com>"
VERSION = "0.0.1"
PACKAGE = "smhtml"

# Logger shared by the package, named after the package itself.
LOGGER = logging.getLogger(PACKAGE)
# vim:sw=4:ts=4:et:

View file

@ -1,185 +0,0 @@
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
# pylint: disable=unused-import
r"""Load and parse MHTML data.
.. versionadded:: 0.0.1
"""
from __future__ import absolute_import
import email
import mimetypes
import os.path
import os
import smhtml.utils
from smhtml.globals import LOGGER
def decode_part(part):
    """
    Collect the decoded payload and metadata of one MIME part.

    :param part: :class:`email.mime.base.MIMEBase` object
    :return:
        A dict with the keys 'type' (content type), 'encoding' (detected
        charset, None for non-text parts), 'data' (decoded text for text
        parts, the raw payload otherwise), 'payload' (the raw payload) and
        'location' (first Content-Location header value, or None)
    """
    payload = part.get_payload(decode=True)
    is_text = part.get_content_maintype() == "text"

    # Only text parts get a charset and are decoded to a string.
    encoding = smhtml.utils.detect_charset(payload) if is_text else None
    content = payload.decode(encoding, "ignore") if is_text else payload

    locations = part.get_all("Content-Location")
    first_location = locations[0] if locations else None

    return {
        "type": part.get_content_type(),
        "encoding": encoding,
        "data": content,
        "payload": payload,
        "location": first_location,
    }
def get_or_gen_filename(part, idx=0):
    """
    Return the filename of a MIME `part`, generating one if it has none.

    A generated name has the form "part-NNN<ext>": the zero-padded `idx`
    followed by the extension guessed from the part's content type, or
    ".bin" when no extension can be guessed.

    :param part: :class:`email.mime.base.MIMEBase` object
    :param idx: Index used in the generated filename
    :return: A filename as a string
    """
    explicit = part.get_filename()
    if explicit:
        return explicit

    suffix = mimetypes.guess_extension(part.get_content_type()) or ".bin"
    return "part-%03d%s" % (idx, suffix)
def parse_itr(mdata):
    """
    Walk MIME multi-part data and yield an info dict for each leaf part.

    Multipart container parts are skipped. Each yielded dict is the result
    of decode_part() extended with the keys 'index' and 'filename'.

    :param mdata: :class:`email.message.Message` object
    :return: A generator yields info of each part in `mdata`
    """
    for position, subpart in enumerate(mdata.walk()):
        if subpart.get_content_maintype() != "multipart":
            name = get_or_gen_filename(subpart, idx=position)

            record = decode_part(subpart)
            record.update(index=position, filename=name)

            LOGGER.debug("part#%d: filename=%s", position, name)
            yield record
def loads_itr(content):
    """
    Parse MHTML data given as a string and yield info of each part.

    :param content: Input MHTML data as a string
    :return: A generator yields info of each part loaded from `content`
    :raises: ValueError if `content` is not multi-part MIME data
    """
    message = email.message_from_string(content)

    if message.is_multipart():
        for part_info in parse_itr(message):
            yield part_info
        return

    # Only the first 100 characters of the input are quoted in the error.
    raise ValueError("Multi-part MIME data was not found in "
                     "given string: %s ..." % content[:100])
def load_itr(filepath):
    """
    Parse the MHTML file at `filepath` and yield info of each part.

    :param filepath: :class:`pathlib.Path` object or a string represents path
    :return: A generator yields each part parsed from `filepath` opened
    :raises: ValueError if the file does not hold multi-part MIME data
    """
    with open(filepath) as stream:
        message = email.message_from_file(stream)

    if message.is_multipart():
        for part_info in parse_itr(message):
            yield part_info
        return

    raise ValueError("Multi-part MIME data was not found in "
                     "'%s'" % filepath)
def loads(content):
    """
    Parse MHTML data given as a string into a list of part info.

    :param content: Input MHTML data as a string
    :return: A list of info of each part of MIME multi-part data
    :raises: ValueError
    """
    parts = list(loads_itr(content))
    return parts
def load(filepath):
    """
    Parse the MHTML file at `filepath` into a list of part info.

    :param filepath: :class:`pathlib.Path` object or a string represents path
    :return: A list of info of each part of MIME multi-part data
    :raises: ValueError
    """
    parts = list(load_itr(filepath))
    return parts
def extract(filepath, output, usebasename=False, outputfilenamer=None):
    """
    Load and extract each part of MIME multi-part data as files from given data
    as a file.

    :param filepath: :class:`pathlib.Path` object represents input
    :param output: :class:`pathlib.Path` object represents output dir
    :param usebasename: Use the basename, not full path, when writing files
    :param outputfilenamer: Callback fn takes `inf` and returns a filename
        For example, it could return a filename based on `inf['location']`
    :raises: ValueError if `output` is "-"
    :raises: OSError if `output` already exists as a regular file
    """
    if output == "-":
        raise ValueError("Output dir must be given to extract")

    if os.path.isfile(output):
        raise OSError("Output '%s' already exists as a file!" % output)

    # Only an existing *file* is rejected above, so an output directory that
    # already exists is reused instead of making os.makedirs() fail.
    if not os.path.isdir(output):
        os.makedirs(output)

    for inf in load_itr(filepath):
        filename = inf["filename"]

        if usebasename:
            filename = os.path.split(filename)[-1]

        # The callback, when given, overrides any name computed above.
        if outputfilenamer:
            filename = outputfilenamer(inf)

        # NOTE(review): filename comes from the MHTML data (or the callback)
        # and is joined unchecked; an absolute or "../" name would be written
        # outside `output`. Confirm whether inputs can be untrusted.
        outpath = os.path.join(output, filename)
        outdir = os.path.dirname(outpath)

        LOGGER.debug("Extract %s from %s", filename, filepath)

        # Filenames may contain sub directories; create them on demand.
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        with open(outpath, "wb") as out:
            out.write(inf["payload"])
# vim:sw=4:ts=4:et:

View file

@ -1,34 +0,0 @@
#
# -*- coding: utf-8; mode: python -*-
#
# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
# License: MIT
#
# pylint: disable=unused-import
r"""Utility functions.
.. versionadded:: 0.0.1
"""
from __future__ import absolute_import
import chardet
def detect_charset(bmsg, default="ascii"):
    r"""
    Detect the charset of given byte data using chardet.

    :param bmsg: A byte data to detect charset
    :param default:
        Charset returned when `bmsg` is empty or chardet reports no encoding
    :return: A string represents charset such as 'utf-8', 'iso-2022-jp'

    >>> detect_charset(b"a")
    'ascii'
    >>> detect_charset(b"")
    'ascii'
    >>> detect_charset(b"", default="utf-8")
    'utf-8'
    """
    if not bmsg:
        return default

    # chardet reports None as the encoding when detection fails; fall back to
    # `default` so callers always get a name they can pass to bytes.decode().
    return chardet.detect(bmsg)["encoding"] or default
# vim:sw=4:ts=4:et:

14
tox.ini
View file

@ -1,14 +0,0 @@
[tox]
envlist = py27, py34, py35, py36, py37
[flake8]
exclude = .git,.tox,dist,*egg,setup.py
[testenv]
deps = -r{toxinidir}/pkg/test_requirements.txt
commands =
flake8 --doctests smhtml tests
- pylint --disable=invalid-name,locally-disabled smhtml
python -m nose -v --with-doctest --all-modules --where tests --with-coverage --cover-tests --cover-package=smhtml
setenv =
PYTHONPATH = {toxinidir}/