#!/usr/bin/python3
"""
imgsizer -- correct image sizes in WWW pages
by Eric S. Raymond <esr@thyrsus.com>

Fix up IMG tags in given documents to contain correct sizes.

This code runs under either Python 2 or Python 3. Keep it that way!

SPDX-License-Identifier: BSD-2-clause

from __future__ import print_function

Changelog:

Originally created by Eric S. Raymond <esr@thyrsus.com> 30 Jul 1996

Modified by Erik Rossen <rossen@planet.ch> 15 May 1999

   Added the --nomagick switch, to use file(1) and rdjpgcom(1)
   to determine the image size instead of identify(1) from the
   ImageMagick suite.

Modified by Michael C. Toren <michael@toren.net> 18 Aug 2000

   Fixed bug where the SRC attribute's value needed to be in quotes,
   improved command line parsing (but it could still use some work),
   added -q switch to omit quotes when generating tags, and -l switch
   to generate lowercase tags.  -mct

Modified by Michael C. Toren <michael@toren.net> 19 Aug 2000

   Improved the command line parsing some more, now looks for additional
   arguments via an IMGSIZER environmental variable, added the -d switch
   to set the DocumentRoot, -v switch to display version information,
   and -h switch to display usage information.  -mct

Modified by Michael C. Toren <michael@toren.net> 23 Feb 2001

   Fixed two bugs reported by Jeroen Valcke <jeroen@valcke.com>, one
   where the -d switch did not function properly if the img src attribute
   was quoted, and another where the &error sub was incorrectly reporting
   the line number an error occurred due to the input record separator
   being set to ">".

Rewritten in Python by Eric S. Raymond <esr@thyrsus.com> 11 July 2001

   Time to get rid of the dependency on httpget.  The -l option is gone, too;
   instead, we deduce the right case by looking at the leading tag.  -q
   is gone; we always emit without quotes.  -m is gone too, instead we
   try commands in least to most expensive order, and notice when a command
   is not found so that we don't try it again.

Fixes by ESR, 29 July 2001

   Incorporated fixes by Peter S. Galbraith.

Fixes by ESR, 25 April 2003

   Merged amended versions of Lennart Poettering's fix for Debian bug 139714
   and Jeroen N. Witmond's fix for Debian bug 168964.  Added regression-test
   production.

Enhancement by ESR, 14 Nov 2003

   Verify and merge Lucien Saviot's patch to produce XHTML from XHTML input.
   Also his change to handle spurious line breaks produced by Dave Raggett's
   tidy(1) utility.

Modified by Andrew Gwozdziewycz <gwozdzie@lucas.cis.temple.edu>, 17 June 2004

   Added support for the Python Imaging Library to determine size in case of
   failure from file(1), rdjpgcom(1) and identify(1).

SPDX-License-Identifier: BSD-2-Clause
"""

# pylint: disable=invalid-name,missing-function-docstring,redefined-outer-name,global-statement,too-many-branches,no-else-return,raise-missing-from,no-else-raise,consider-using-f-string,consider-using-with

# pylint: disable=multiple-imports
import sys, os, getopt, re, filecmp

# Warning: In some Python 3 versions getstatusoutput() returns
# status incorrectly so that a nonzero exit looks like the subprocess
# was signaled!  (Observed under 3.4.3; Debian bug #764848)
try:
    from subprocess import getstatusoutput
except ImportError:
    from commands import getstatusoutput

try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# Release number; substituted into the splash banner below.
version = "2.12"

# Banner text.  The %s placeholder is meant to be filled in with `version`
# (i.e. use it as `splash % version`).
splash = """imgsizer version %s, Eric S. Raymond <esr@thyrsus.com>
See <http://www.catb.org/~esr/software.html> for updates."""

# Help text describing the command-line options.
usage = """Usage: imgsizer [OPTIONS] [HTML File]

Options:

    -V, --version

        Display version information and exit.

    -h, --help

        Display usage information.

    -d <directory>, --document-root <directory>

        Directory where absolute image filenames (i.e, ones which contain
        a leading "/") may be found.

    -n, --no-overwrite

        Don't overwrite existing width and height tags if both are present.

"""

# Optimization latches -- if an attempt to invoke a command returns 127
# ("not found"), its latch is turned off and that command won't be tried
# again.  The pythonimage latch is likewise cleared by imgsize() when the
# Python Imaging Library cannot be used.
magick = 1  # using ImageMagick by default
rdjpgcom = 1  # using rdjpgcom by default
pythonimage = 1  # use python imaging library


def attrformat(xc, dim):
    """Format one attribute as ` name="value"`, with a leading space.

    xc is the attribute value (converted with str()); dim is the attribute
    name.  The name is emitted as given when the module-level `lower` flag
    is true and upper-cased otherwise.
    """
    name = dim if lower else dim.upper()
    return "".join((" ", name, '="', str(xc), '"'))


def sizefix(infp, outfp):
    """Copy infp to outfp, fixing up the attributes of each image tag.

    Input is read one character at a time and echoed unchanged, except
    that the attribute text of every tag opening with "<img" or "<IMG" is
    passed through transform() before being written.  A tag ends at ">"
    or "/>".  The module-level `lower` flag is set from the case of the
    tag name so that generated attributes match it.

    If the input ends in the middle of a tag, whatever was read is written
    out unmodified, so output is never shorter than the input.
    """
    global lower
    while 1:
        ch = infp.read(1)
        if ch == "":
            return
        outfp.write(ch)
        if ch != "<":
            continue
        # within an HTML tag
        lead = infp.read(2)
        outfp.write(lead)
        if lead not in ("im", "IM"):
            continue
        # splitting the read this way copes with single-char tags like <b>
        third = infp.read(1)
        # Echo exactly what was read; at EOF this is "" and nothing is
        # duplicated.
        outfp.write(third)
        lead = lead + third
        if lead not in ("img", "IMG"):
            continue
        # within an image tag
        lower = lead == "img"
        parts = []
        while 1:
            ch = infp.read(1)
            if ch == "":
                # EOF inside an unterminated tag: pass the text through
                # untouched rather than dropping it.
                outfp.write("".join(parts))
                return
            if ch == ">":
                break
            if ch == "/":
                # Look one character ahead to recognise the XHTML "/>" close.
                ch2 = infp.read(1)
                ch = ch + ch2
                if ch2 == ">":
                    break
            parts.append(ch)
        # ch now holds the terminator, either ">" or "/>".
        outfp.write(transform("".join(parts)) + ch)


# Matches " <width> x <height>" (digits, optional spaces around the "x");
# applied to the output of file(1) and identify(1).
x_match = re.compile(r" ([0-9]+) *x *([0-9]+)")
# Matches " <width>w * <height>h"; applied to the output of
# "rdjpgcom -verbose".
rdjpg_match = re.compile(r" ([0-9]+)w *\* *([0-9]+)h")


def imgsize(src):
    """Return the image size in pixels for a given image source.

    src is a URL; if it contains no ":" it is treated as a local path and
    fetched through a "file:" URL.  Returns a (width, height) pair --
    strings when parsed from command output, whatever Image.size holds
    when the Python Imaging Library is used -- or None if the image cannot
    be fetched or measured.

    Tools are tried from cheapest to most expensive: file(1), with
    rdjpgcom(1) for JPEGs, then identify(1), then the Python Imaging
    Library.  The module-level latches magick, rdjpgcom and pythonimage
    are cleared when the corresponding tool turns out to be unavailable.
    """
    global magick, rdjpgcom, pythonimage
    # pylint: disable=import-outside-toplevel
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote
    try:
        if not ":" in src:
            src = "file:" + src
        (filename, _headers) = urlretrieve(src)
    except (IOError, ValueError):
        # IOError: missing file or failed fetch.  ValueError: the URL is
        # malformed (e.g. has no scheme before the colon).
        return None
    # The name comes from untrusted HTML (for remote URLs the temp-file
    # suffix is taken from the URL path), so it must be shell-quoted before
    # being placed on a command line.
    quoted = quote(filename)
    # Now let's see if we can get a size for the retrieved image.
    # Try file(1) first -- cheapest, as it doesn't read the whole image
    (status, output) = getstatusoutput("file " + quoted)
    if status == 0:
        # file(1) works for every common image format other than JPEG
        if output.find("JPEG") == -1:
            sizes = x_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
        elif rdjpgcom:
            # Use rdjpgcom(1) to handle JPEGs
            (status, output) = getstatusoutput("rdjpgcom -verbose " + quoted)
            sizes = rdjpg_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
            elif status == 127:
                rdjpgcom = 0
    # Next try identify(1), more expensive but bulletproof
    if magick:
        (status, output) = getstatusoutput("identify " + quoted)
        if status == 0:
            sizes = x_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
        elif status == 127:
            sys.stderr.write("imgsizer: giving up on ImageMagick\n")
            magick = 0
    # if that fails, try at _LAST_ resort Python Imaging Library
    # open doesn't actually load all the data, so it shouldn't be too expensive
    if pythonimage:
        try:
            try:
                # Pillow only installs the module inside the PIL package.
                from PIL import Image
            except ImportError:
                # Original PIL exposed a top-level Image module.
                import Image
        except ImportError:
            sys.stderr.write("imgsizer: giving up on Python Imaging Library\n")
            pythonimage = 0
        else:
            try:
                return Image.open(filename).size
            except IOError:
                # Only this image is unreadable; keep the library enabled
                # for later images.
                pass

    # All attempts failed
    sys.stderr.write("imgsizer: couldn't analyze %s\n" % src)
    return None


# Attribute patterns applied to the text of an image tag; all of them are
# case-insensitive.
# SRC attribute, value optionally double-quoted; group 1 is the value, which
# runs up to the next double quote, space, tab or newline.
source = re.compile(r'SRC\s*=\s*"?([^" \t\n]*)"?', re.I)
# WIDTH / HEIGHT attributes with a (possibly empty) all-digit value, together
# with any spaces immediately preceding the attribute name.
awidth = re.compile(r' *WIDTH\s*=\s*"?[0-9]*"?', re.I)
aheight = re.compile(r' *HEIGHT\s*=\s*"?[0-9]*"?', re.I)
# WIDTH / HEIGHT attributes whose value ends in a percent sign.
pwidth = re.compile(r'WIDTH\s*=\s*"?[0-9]*%"?', re.I)
pheight = re.compile(r'HEIGHT\s*=\s*"?[0-9]*%"?', re.I)


def transform(attr):
    """Return the attribute text of an image tag with correct sizes added.

    attr is everything between the tag name and the closing ">" or "/>".
    It is returned unchanged when it has no usable SRC, when an existing
    WIDTH or HEIGHT is a percentage, when no_overwrite is set and both
    WIDTH and HEIGHT are already present, or when the image size cannot be
    determined.  Otherwise any existing numeric WIDTH/HEIGHT attributes are
    removed and fresh ones, formatted by attrformat(), are appended.
    """
    src = source.search(attr)
    # Must have a source part and no percents in existing width or height
    if not src or pwidth.search(attr) or pheight.search(attr):
        return attr
    if no_overwrite and awidth.search(attr) and aheight.search(attr):
        return attr
    url = src.group(1)
    # An empty SRC (src="") names no image; nothing to measure.
    if not url:
        return attr
    # Correct the url for documentation root, if present
    if url.startswith("/") and root:
        url = os.path.join(root, url[1:])
    # OK, get the size tuple if possible
    dimensions = imgsize(url)
    if not dimensions:
        return attr
    # Nuke any old size attr.  This must happen even under no_overwrite:
    # reaching this point then means at most one of WIDTH/HEIGHT exists,
    # and keeping it would leave a duplicate next to the new pair.
    attr = awidth.sub("", attr)
    attr = aheight.sub("", attr)
    (xc, yc) = dimensions
    # Plug in the new attr
    return attr + attrformat(xc, "width") + attrformat(yc, "height")


# Output lowercase tags by default.  sizefix() resets this flag for each
# image tag it finds, according to the case of the tag name.
lower = 1

# Set the default DocumentRoot to the current working directory.
root = "."

# NOTE(review): `out` and `mydir` are not referenced anywhere else in the
# visible file; they look like leftovers -- confirm before removing.
out = "imgsizer-out$$"
mydir = "."  # NOTE: if you are doing <yourfile make sure that pwd is correct!

# Collect options from the environment first, then the command line
options = os.environ.get("IMGSIZER", "").split() + sys.argv[1:]

# Process options.  The long option is spelled "document-root" to match the
# usage text; getopt accepts any unique prefix, so the older "--document"
# spelling keeps working.
try:
    (options, arguments) = getopt.getopt(
        options,
        "Vhd:n",
        ("version", "help", "usage", "document-root=", "no-overwrite"),
    )
except getopt.GetoptError as err:
    sys.stderr.write("imgsizer: %s\n" % err)
    sys.stderr.write(usage)
    raise SystemExit(2)
no_overwrite = 0
for (switch, val) in options:
    if switch in ("-V", "--version"):
        print(splash % version)
        raise SystemExit
    elif switch in ("-h", "--help", "--usage"):
        # The banner carries a %s placeholder for the version number.
        print((splash % version) + "\n\n" + usage)
        raise SystemExit
    elif switch in ("-d", "--document", "--document-root"):
        root = val
        if not os.path.isdir(root):
            sys.stderr.write("Document root isn't a directory\n")
            raise SystemExit(1)
    elif switch in ("-n", "--no-overwrite"):
        no_overwrite = 1

if not arguments:
    # Filter mode: standard input to standard output.
    sizefix(sys.stdin, sys.stdout)
else:
    # In-place mode: rewrite each named file via a temporary sibling file.
    for myfile in arguments:
        try:
            infp = open(myfile, encoding="ascii", errors="surrogateescape")
        except (IOError, OSError):
            sys.stderr.write("imgsizer: can't open input file %s\n" % myfile)
            raise SystemExit(1)
        tempfile = myfile + ".~imgsizer-%d~" % os.getpid()
        try:
            outfp = open(tempfile, "w", encoding="ascii", errors="surrogateescape")
        except OSError:
            infp.close()
            sys.stderr.write("imgsizer: can't open tempfile\n")
            raise SystemExit(1)
        # Both files must be closed (and the output flushed) before they are
        # compared, otherwise the comparison sees a partially written file.
        with infp, outfp:
            sizefix(infp, outfp)
        # shallow=False forces a content comparison; a same-size rewrite
        # with a matching mtime must not be mistaken for "unchanged".
        if filecmp.cmp(myfile, tempfile, shallow=False):
            os.remove(tempfile)
        else:
            try:
                os.rename(tempfile, myfile)
            except OSError:
                sys.stderr.write("imgsizer: couldn't replace " + myfile + "\n")
                os.remove(tempfile)

# End
