start rework

2021-07-23 03:00:36 -04:00 · 2021-07-23 03:00:36 -04:00 · 0358c971aa
parent 56271cee9a
commit 0358c971aa
23 changed files with 172 additions and 507 deletions
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2019 Satoru SATOH
+Copyright (c) 2021 Zoey Mae

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,9 +0,0 @@
-include LICENSE
-include MANIFEST.in
-include README.*
-include setup.*
-include tox.ini .gitignore .travis.yml
-include pkg/*
-include examples/*
-recursive-include smhtml *.py
-recursive-include tests *.py
--- a/README.md
+++ b/README.md
@ -0,0 +1,7 @@
+# PyMHTML
+
+A simple library to parse MHTML files and compile them to a single HTML string. It can export the string as an HTML file, or export the original HTML file together with all of its supporting files.
+
+Forked from [SMHTML](https://github.com/ssato/python-smhtml)
+
+Note: work in progress
--- a/README.rst
+++ b/README.rst
@ -1,44 +0,0 @@
-=================
-python-smhtml
-=================
-
-About
-======
-
-.. .. image:: https://img.shields.io/pypi/v/smhtml.svg
-   :target: https://pypi.python.org/pypi/smhtml/
-   :alt: [Latest Version]
-
-.. .. image:: https://img.shields.io/pypi/pyversions/smhtml.svg
-   :target: https://pypi.python.org/pypi/smhtml/
-   :alt: [Python versions]
-
-.. image:: https://api.travis-ci.org/ssato/python-smhtml.png
-   :target: https://travis-ci.org/ssato/python-smhtml
-   :alt: [Test status]
-
-.. image:: https://coveralls.io/repos/ssato/python-smhtml/badge.png
-   :target: https://coveralls.io/r/ssato/python-smhtml
-   :alt: [Coverage Status]
-
-.. .. image:: https://landscape.io/github/ssato/python-smhtml/master/landscape.png
-   :target: https://landscape.io/github/ssato/python-smhtml/master
-   :alt: [Code Health]
-
-This is a simple and experimental python library to load MHTML files and
-extract files from them, and dump (make) MHTML [#]_ data from files.
-
- Author: Satoru SATOH <satoru.satoh@gmail.com>
- License: MIT
-
-.. [#] https://en.wikipedia.org/wiki/MHTML
-
-Misc
-======
-
-Here is a demo screenshot to show its CLI frontend can extract files from MHTML
-data and make (dump) MHTML data from files.
-
-.. image:: examples/smhtml_cli-screenshot-0.png
-
-.. vim:sw=2:ts=2:et:
--- a/examples/smhtml_cli-screenshot-0.png
+++ b/examples/smhtml_cli-screenshot-0.png
--- a/pkg/nose.cfg
+++ b/pkg/nose.cfg
@ -1,10 +0,0 @@
-[nosetests]
-verbosity=2
-with-doctest=1
-#all-modules=1
-# Requires that nosetest processes multiple modules' test cases at once.
-# processes=4
-
-# coverage:
-cover-package=smhtml
-cover-branches=1
--- a/pkg/package.spec.in
+++ b/pkg/package.spec.in
@ -1,93 +0,0 @@
-%global pkgname smhtml
-
-%if 0%{?fedora} || 0%{?rhel} > 7 || 0%{?epel} > 7
-%global with_python3 1
-%endif
-
-%global desc \
-A simple and experimental python library to parse and dump MHTML data.
-
-Name:           python-%{pkgname}
-Version:        @VERSION@
-Release:        1%{?dist}
-Summary:        Python library to parse and dump MHTML data
-Group:          Development/Tools
-License:        MIT
-URL:            https://github.com/ssato/python-smhtml
-Source0:        %{url}/archive/RELEASE_%{version}.tar.gz
-BuildArch:      noarch
-%if 0%{?with_python3}
-BuildRequires:  python3-devel
-BuildRequires:  python3-setuptools
-%else
-BuildRequires:  python2-setuptools
-BuildRequires:  python2-devel
-%endif
-
-%description %{desc}
-
-%if 0%{?with_python3}
-%package -n python3-%{pkgname}
-Summary:        %{summary}
-Requires:       python3-chardet
-%{?python_provide:%python_provide python3-%{pkgname}}
-
-%description -n python3-%{pkgname} %{desc}
-%else
-
-%package -n python2-%{pkgname}
-Summary:        %{summary}
-Requires:       python2-chardet
-%{?python_provide:%python_provide python2-%{pkgname}}
-
-%description -n python2-%{pkgname} %{desc}
-%endif
-
-%prep
-%autosetup -n %{pkgname}-%{version}
-
-%build
-%if 0%{?with_python3}
-%py3_build
-%else
-%py2_build
-# Dirty hacks.
-test -d %{buildroot}%{python_sitelib}/%{pkgname} || {
-cp -a src/%{pkgname} %{buildroot}%{python_sitelib}/
-}
-test -d %{buildroot}/usr/bin || {
-install -d %{buildroot}/usr/bin
-cat << EOF > %{buildroot}/usr/bin/smhtml_cli
-#! /usr/bin/python
-import sys
-from pkg_resources import load_entry_point
-
-if __name__ == '__main__':
-    sys.exit(
-        load_entry_point('smhtml', 'console_scripts', 'smhtml_cli')()
-    )
-EOF
-}
-chmod +x %{buildroot}/usr/bin/smhtml_cli
-%endif
-
-%install
-%if 0%{?with_python3}
-%py3_install
-%else
-%py2_install
-%endif
-
-%if 0%{?with_python3}
-%files -n python3-%{pkgname}
-%{python3_sitelib}/%{pkgname}*
-%else
-%files -n python2-%{pkgname}
-%{python_sitelib}/%{pkgname}*
-%endif
-%doc README.rst
-%{_bindir}/*
-
-%changelog
-* Mon Feb 25 2019 Satoru SATOH <ssato@redhat.com> - 0.0.1-1
- Initial packaging
--- a/pkg/requirements.txt
+++ b/pkg/requirements.txt
@ -1 +0,0 @@
-chardet
--- a/pkg/test_requirements.txt
+++ b/pkg/test_requirements.txt
@ -1,6 +0,0 @@
-r requirements.txt
-coveralls
-flake8 < 3.5.0
-nose
-pycodestyle < 2.4.0
-pylint
--- a/pymhtml/__init__.py
+++ b/pymhtml/__init__.py
@ -0,0 +1,14 @@
+AUTHOR = 'Zoey Mae'
+VERSION = (0, 1, 0)  # version components; joined with '.' into __version__ below
+LICENSE = 'MIT'
+PACKAGE = 'PyMHTML'
+REPO = 'https://git.barkshark.xyz/izaliamae/pymhtml'
+
+__version__ = '.'.join(str(v) for v in VERSION)  # dotted string form of VERSION, i.e. '0.1.0'
+
+from .misc import MhtmlObject, DotDict
+from .parser import MhtmlParser
+
+
+def parse(filename):  # convenience wrapper: build and return an MhtmlParser for `filename`
+	return MhtmlParser(filename)
--- a/pymhtml/__main__.py
+++ b/pymhtml/__main__.py
@ -0,0 +1,25 @@
+import sys
+
+from pathlib import Path
+
+from .parser import MhtmlParser
+
+
+try:
+	raw_path = Path(sys.argv[1])  # first command-line argument: the MHTML file to parse
+
+	if str(raw_path).startswith('~'):  # a literal leading '~' is expanded to the home directory
+		path = raw_path.expanduser()
+
+	else:
+		path = raw_path.resolve()
+
+except IndexError:  # raised by sys.argv[1] when no argument was given
+	raise IndexError('Forgot to specify an MHTML file to parse') from None
+
+test = MhtmlParser(path)  # the constructor parses the file immediately
+print('MHTML Object:', test)
+print('HTML File:', test.html)  # the first text/html part, or None when there is none
+
+for part in test.files.values():  # all remaining parts, stored by filename
+	print('MHTML Part:', part)
--- a/pymhtml/cli.py
+++ b/pymhtml/cli.py
--- a/pymhtml/dumper.py
+++ b/pymhtml/dumper.py
--- a/pymhtml/misc.py
+++ b/pymhtml/misc.py
@ -0,0 +1,40 @@
+import chardet, mimetypes
+
+from pathlib import Path
+
+
+class DotDict(dict):  # dict whose keys can also be read, written and deleted as attributes
+	__getattr__ = dict.__getitem__  # obj.key -> obj['key'] (a missing key raises KeyError)
+	__setattr__ = dict.__setitem__  # obj.key = value -> obj['key'] = value
+	__delattr__ = dict.__delitem__  # del obj.key -> del obj['key'] (was __getitem__, which removed nothing)
+
+
+	def copy(self):  # shallow copy that stays a DotDict instead of degrading to a plain dict
+		return DotDict(self)
+
+class MhtmlObject(DotDict):  # one decoded, non-multipart MIME part of an MHTML file
+	def __init__(self, idx, part):  # idx: index of the part (used in generated filenames); part: email Message
+		super().__init__()
+
+		self.body = part.get_payload(decode=True)
+		self.mimetype = part.get_content_type()
+		self.base_type = part.get_content_maintype()
+		self.location = (part.get_all('Content-Location') or [''])[0]  # get_all() returns None when the header is absent
+		self.filename = part.get_filename()
+		self.encoding = None
+
+		if not self.filename:
+			if not self.location or self.location.startswith('data:'):  # no usable name in the location: generate one
+				ext = mimetypes.guess_extension(part.get_content_type()) or '.bin'
+				self.filename = f'part-{idx}{ext}'
+
+			else:
+				self.filename = Path(self.location.split('?', 1)[0]).name  # strip the query first: it may itself contain '/'
+
+		if self.base_type == 'text':
+			self.encoding = chardet.detect(self.body)['encoding']
+			self.body = self.body.decode(self.encoding or 'ascii')  # chardet reports None when it cannot guess
+
+
+	def __str__(self):
+		return f'{self.__class__.__name__}(name="{self.filename}", location="{self.location}", mimetype="{self.mimetype}")'
--- a/pymhtml/parser.py
+++ b/pymhtml/parser.py
@ -0,0 +1,54 @@
+#
+# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
+# License: MIT
+#
+# pylint: disable=unused-import
+r"""Load and parse MHTML data.
+
+.. versionadded:: 0.0.1
+"""
+
+import email
+
+from pathlib import Path
+
+from .misc import MhtmlObject
+
+
+class MhtmlParser:  # loads an MHTML file and splits it into the main HTML part and the remaining parts
+	def __init__(self, filepath):  # filepath: str or Path of the MHTML file; it is parsed immediately
+		self.filepath = Path(filepath).resolve()
+		self.html = None  # first text/html part, filled in by parse_file()
+		self.files = {}  # every other part, keyed by its filename
+
+		self.parse_file()
+
+
+	def __str__(self):
+		return f'{self.__class__.__name__}(file="{self.filepath}", page="{self.html.location if self.html else None}")'  # html is None when no text/html part was found
+
+
+	def parse_file(self):  # raises TypeError when the file is not multipart MIME data
+		mdata = email.message_from_bytes(self.filepath.read_bytes())  # read_bytes() closes the file; parsing bytes avoids a locale-dependent decode
+
+		if not mdata.is_multipart():
+			raise TypeError('Not an MHTML file or missing MIME data')
+
+		for idx, part in enumerate(mdata.walk()):
+			if part.get_content_maintype() == "multipart":  # skip multipart containers, keep only their sub-parts
+				continue
+
+			data = MhtmlObject(idx, part)
+
+			if data.mimetype == 'text/html' and not self.html:
+				self.html = data
+
+			else:
+				self.files[data.filename] = data  # NOTE: parts that share a filename overwrite each other
+
+
+	def build_page(self):  # placeholder: not implemented yet
+		pass
+
+
+
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1 @@
+chardet>=4.0.0
--- a/setup.py
+++ b/setup.py
@ -1,46 +1,33 @@
-from __future__ import absolute_import
-
-import os.path
-import os
-import re
-import sys
-import setuptools
-import setuptools.command.bdist_rpm
+#!/usr/bin/env python3
+from setuptools import setup, find_packages


-# It might throw IndexError and so on.
-VERSION = [re.search(r'^VERSION = "([^"]+)"', l).groups()[0] for l
-           in open("smhtml/globals.py").readlines()
-           if "VERSION" in l][0]
+setup(
+	name='PyMHTML',
+	version='0.1.0',
+	packages=find_packages(),
+	python_requires='>=3.6', install_requires=['chardet>=4.0.0'],  # f-strings need 3.6; chardet is imported by pymhtml.misc
+	include_package_data=False,
+	author='Zoey Mae',
+	author_email='admin@barkshark.xyz',
+	description='Simple MHTML parser and exporter',
+	keywords='web html mhtml',
+	url='https://git.barkshark.xyz/izaliamae/pymhtml',
+	project_urls={
+		'Bug Tracker': 'https://git.barkshark.xyz/izaliamae/pymhtml/issues',
+		'Documentation': 'https://git.barkshark.xyz/izaliamae/pymhtml/wiki',
+		'Source Code': 'https://git.barkshark.xyz/izaliamae/pymhtml'
+	},
 
-# For daily snapshot versioning mode:
-if os.environ.get("_SNAPSHOT_BUILD", None) is not None:
-    import datetime
-    VERSION = VERSION + datetime.datetime.now().strftime(".%Y%m%d")
-
-
-class bdist_rpm(setuptools.command.bdist_rpm.bdist_rpm):
-    """Override the default content of the RPM SPEC.
-    """
-    spec_tmpl = os.path.join(os.path.abspath(os.curdir),
-                             "pkg/package.spec.in")
-
-    def _replace(self, line):
-        """Replace some strings in the RPM SPEC template"""
-        if "@VERSION@" in line:
-            return line.replace("@VERSION@", VERSION)
-
-        if "Source0:" in line:  # Dirty hack
-            return "Source0: %{pkgname}-%{version}.tar.gz"
-
-        return line
-
-    def _make_spec_file(self):
-        return [self._replace(l.rstrip()) for l
-                in open(self.spec_tmpl).readlines()]
-
-
-setuptools.setup(name="smhtml", version=VERSION,
-                 cmdclass=dict(bdist_rpm=bdist_rpm))
-
-# vim:sw=4:ts=4:et:
+	classifiers=[
+		'License :: OSI Approved :: MIT License',  # the doubled 'License ::' prefix is not a valid trove classifier
+		'Development Status :: 3 - Alpha',
+		'Programming Language :: Python :: 3.6',  # kept in step with python_requires
+		'Operating System :: POSIX',
+		'Operating System :: MacOS :: MacOS X',
+		'Operating System :: Microsoft :: Windows',
+		'Topic :: Internet :: WWW/HTTP',
+		'Topic :: Software Development :: Libraries',
+		'Topic :: Software Development :: Libraries :: Python Modules'
+	]
+)
--- a/smhtml/__init__.py
+++ b/smhtml/__init__.py
@ -1,27 +0,0 @@
-#
-# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
-# License: MIT
-#
-r"""
-.. module:: smhtml
-   :platform: Unix, Windows
-   :synopsis: Simple python library to load, extract and dump MHTML data
-
-python-smhtml is a simple and experimental python library to load MHTML files and
-extract files from them, and dump (make) MHTML data from files.
-
- Home: https://github.com/ssato/python-smhtml
-
-About MHTML format, please refer other web pages such like
-https://en.wikipedia.org/wiki/MHTML.
-"""
-from .api import (
-    AUTHOR, VERSION, load, loads, dump, dumps, extract  # flake8: noqa
-)
-
-__author__ = AUTHOR
-__version__ = VERSION
-
-__all__ = ["load", "load", "extract", "dump", "dumps"]
-
-# vim:sw=4:ts=4:et:
--- a/smhtml/api.py
+++ b/smhtml/api.py
@ -1,24 +0,0 @@
-#
-# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
-# License: MIT
-#
-# pylint: disable=unused-import
-r"""Public APIs of smhtml module.
-
-.. versionadded:: 0.0.1
-"""
-from __future__ import absolute_import
-
-from .globals import (
-    PACKAGE, AUTHOR, VERSION, LOGGER  # flake8: noqa
-)
-from .loader import load, loads, extract  # flake8: noqa
-from .dumper import dump, dumps  # flake8: noqa
-
-
-def version():
-    """:return: Version info tuple, (major, minor, release), e.g. (0, 8, 2)
-    """
-    return VERSION.split('.')
-
-# vim:sw=4:ts=4:et:
--- a/smhtml/globals.py
+++ b/smhtml/globals.py
@ -1,16 +0,0 @@
-#
-# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
-# License: MIT
-#
-"""Some globals of smhtml module.
-"""
-import logging
-
-
-PACKAGE = "smhtml"
-AUTHOR = "Satoru SATOH <satoru.satoh@gmail.com>"
-VERSION = "0.0.1"
-
-LOGGER = logging.getLogger(PACKAGE)
-
-# vim:sw=4:ts=4:et:
--- a/smhtml/loader.py
+++ b/smhtml/loader.py
@ -1,185 +0,0 @@
-#
-# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
-# License: MIT
-#
-# pylint: disable=unused-import
-r"""Load and parse MHTML data.
-
-.. versionadded:: 0.0.1
-"""
-from __future__ import absolute_import
-
-import email
-import mimetypes
-import os.path
-import os
-
-import smhtml.utils
-from smhtml.globals import LOGGER
-
-
-def decode_part(part):
-    """
-    Decode a part of MIME multi-part data.
-
-    :param part: :class:`email.mime.base.MIMEBase` object
-    :return: A dict contains various info of given MIME `part` data
-    """
-    bdata = part.get_payload(decode=True)
-    ctype = part.get_content_type()
-    mtype = part.get_content_maintype()
-
-    if mtype == "text":
-        charset = smhtml.utils.detect_charset(bdata)
-        data = bdata.decode(charset, "ignore")
-    else:
-        charset = None
-        data = bdata
-
-    location = part.get_all("Content-Location")
-    return dict(type=ctype, encoding=charset, data=data, payload=bdata,
-                location=location[0] if location else None)
-
-
-def get_or_gen_filename(part, idx=0):
-    """
-    Get the filename from given MIME `part` data or generate filename to be
-    used to save its payload later.
-
-    :param part: :class:`email.mime.base.MIMEBase` object
-    :return: A filename as a string
-    """
-    filename = part.get_filename()
-    if not filename:
-        fileext = mimetypes.guess_extension(part.get_content_type())
-        if not fileext:
-            fileext = ".bin"
-        filename = "part-%03d%s" % (idx, fileext)
-
-    return filename
-
-
-def parse_itr(mdata):
-    """
-    An iterator to yield each info from given MIME multi-part data.
-
-    :param mdata: :class:`email.message.Message` object
-    :return: A generator yields info of each part in `mdata`
-    """
-    for idx, part in enumerate(mdata.walk()):
-        if part.get_content_maintype() == "multipart":
-            continue
-
-        filename = get_or_gen_filename(part, idx=idx)
-        info = decode_part(part)
-        info["index"] = idx
-        info["filename"] = filename
-
-        LOGGER.debug("part#%d: filename=%s", idx, filename)
-
-        yield info
-
-
-def loads_itr(content):
-    """
-    An iterator to yield each info from given MIME multi-part data as a string
-    after some checks.
-
-    :param content: Input MHTML data as a string
-    :return: A generator yields info of each part loaded from `content`
-    :raises: ValueError
-    """
-    mdata = email.message_from_string(content)
-
-    if not mdata.is_multipart():
-        raise ValueError("Multi-part MIME data was not found in "
-                         "given string: %s ..." % content[:100])
-
-    for info in parse_itr(mdata):
-        yield info
-
-
-def load_itr(filepath):
-    """
-    An iterator to yield each info from given MIME multi-part data as a file
-    after some checks.
-
-    :param filepath: :class:`pathlib.Path` object or a string represents path
-    :return: A generator yields each part parsed from `filepath` opened
-    :raises: ValueError
-    """
-    with open(filepath) as fobj:
-        mdata = email.message_from_file(fobj)
-
-    if not mdata.is_multipart():
-        raise ValueError("Multi-part MIME data was not found in "
-                         "'%s'" % filepath)
-
-    for info in parse_itr(mdata):
-        yield info
-
-
-def loads(content):
-    """
-    Load and return a list of info of each part of MIME multi-part data from
-    given data as a string.
-
-    :param content: Input MHTML data as a string
-    :return: A list of info of each part of MIME multi-part data
-    :raises: ValueError
-    """
-    return list(loads_itr(content))
-
-
-def load(filepath):
-    """
-    Load and return a list of info of each part of MIME multi-part data from
-    given data as a file.
-
-    :param filepath: :class:`pathlib.Path` object or a string represents path
-    :return: A list of info of each part of MIME multi-part data
-    :raises: ValueError
-    """
-    return list(load_itr(filepath))
-
-
-def extract(filepath, output, usebasename=False, outputfilenamer=None):
-    """
-    Load and extract each part of MIME multi-part data as files from given data
-    as a file.
-
-    :param filepath: :class:`pathlib.Path` object represents input
-    :param output: :class:`pathlib.Path` object represents output dir
-    :param usebasename: Use the basename, not full path, when writing files
-    :param outputfilenamer: Callback fn takes `inf` and returns a filename
-    For example, it could return a filename based on `inf['location']`
-    :raises: ValueError
-    """
-    if output == "-":
-        raise ValueError("Output dir must be given to extract")
-
-    if os.path.exists(output) and os.path.isfile(output):
-        raise OSError("Output '%s' already exists as a file!" % output)
-
-    os.makedirs(output)
-    for inf in load_itr(filepath):
-        filename = inf["filename"]
-
-        if usebasename:
-            filename = os.path.split(filename)[-1]
-
-        if outputfilenamer:
-            filename = outputfilenamer(inf)
-
-        outpath = os.path.join(output, filename)
-        outdir = os.path.dirname(outpath)
-
-        LOGGER.debug("Extract %s from %s", filename, filepath)
-
-        if not os.path.exists(outdir):
-            os.makedirs(outdir)
-
-        with open(outpath, "wb") as out:
-            out.write(inf["payload"])
-
-# vim:sw=4:ts=4:et:
--- a/smhtml/utils.py
+++ b/smhtml/utils.py
@ -1,34 +0,0 @@
-#
-# -*- coding: utf-8; mode: python -*-
-#
-# Copyright (C) 2019 Satoru SATOH <satoru.satoh@gmail.com>
-# License: MIT
-#
-# pylint: disable=unused-import
-r"""Utility functions.
-
-.. versionadded:: 0.0.1
-"""
-from __future__ import absolute_import
-
-import chardet
-
-
-def detect_charset(bmsg, default="ascii"):
-    r"""
-    :param bmsg: A byte data to detect charset
-    :return: A string represents charset such as 'utf-8', 'iso-2022-jp'
-
-    >>> detect_charset(b"a")
-    'ascii'
-    >>> detect_charset(b"")
-    'ascii'
-    >>> detect_charset(u"あ".encode("utf-8"))
-    'utf-8'
-    """
-    if not bmsg:
-        return default
-
-    return chardet.detect(bmsg)["encoding"]
-
-# vim:sw=4:ts=4:et:
--- a/tox.ini
+++ b/tox.ini
@ -1,14 +0,0 @@
-[tox]
-envlist = py27, py34, py35, py36, py37
-
-[flake8]
-exclude = .git,.tox,dist,*egg,setup.py
-
-[testenv]
-deps = -r{toxinidir}/pkg/test_requirements.txt
-commands =
-        flake8 --doctests smhtml tests
-        - pylint --disable=invalid-name,locally-disabled smhtml
-        python -m nose -v --with-doctest --all-modules --where tests --with-coverage --cover-tests --cover-package=smhtml
-setenv =
-    PYTHONPATH = {toxinidir}/