# republisher/repub/srcset.py

from __future__ import unicode_literals

import math
import re

# ASCII whitespace code points, see
# https://infra.spec.whatwg.org/#ascii-whitespace
WHITESPACES = (
    "\u0009",  # TAB
    "\u000a",  # LF
    "\u000c",  # FF
    "\u000d",  # CR
    "\u0020",  # SPACE
)

# States of the descriptor tokenizer used by SRCSet while parsing.
STATE_IN_DESCRIPTOR, STATE_AFTER_DESCRIPTOR, STATE_IN_PARENS = 1, 2, 3


class SRCSet(object):
    """
    Parser and serializer for the value of an HTML ``srcset`` attribute.

    ``parse()`` turns the raw attribute value into a list of candidate dicts
    with the keys ``url``, ``w``, ``x`` and ``h``; ``stringify()`` builds an
    attribute value back from ``self.candidates``.
    """

    # Raw attribute value, as given to the constructor.
    raw = None
    # List of candidate dicts filled in by parse(); None until it has run.
    candidates = None

    # "Valid non-negative integer" of the HTML spec: ASCII digits only.
    # int()/str.isdigit() are not used because they also accept non-ASCII
    # decimal digits.
    _INTEGER_RE = re.compile(r"[0-9]+\Z")
    # "Valid floating-point number" of the HTML spec. float() alone is too
    # lax: it accepts "1_0", "1.e5", "inf", surrounding whitespace and
    # non-ASCII digits, none of which may appear in a srcset descriptor.
    _FLOAT_RE = re.compile(
        r"-?(?:[0-9]+(?:\.[0-9]+)?|\.[0-9]+)(?:[eE][+-]?[0-9]+)?\Z"
    )

    def __init__(self, string):
        """
        Store the raw ``srcset`` attribute value; nothing is parsed yet.
        """
        self.raw = string

    def parse(self):
        """
        Parse ``self.raw`` and return the list of image candidates.

        Based on algorithm from
        https://html.spec.whatwg.org/multipage/images.html#parse-a-srcset-attribute

        Every candidate is a dict ``{"url", "w", "x", "h"}``. The descriptor
        entries hold the descriptor text without its unit letter, or None
        when that descriptor is absent. Candidates with invalid descriptors
        are dropped. The result is also stored in ``self.candidates``.
        """
        # Step 1, 2, 3
        raw = self.raw
        length = len(raw)
        pos = 0
        candidates = []

        while True:
            # Step 4: skip separators in front of the next candidate
            pos, _ = collect_characters_in(raw, pos, WHITESPACES + (",",))

            # Step 5: the only place where we leave the loop
            if pos >= length:
                self.candidates = candidates
                return candidates

            # Step 6: the URL runs up to the next whitespace. It cannot be
            # empty here: step 4 stopped on a non-whitespace, non-comma char.
            pos, url = collect_characters_out(raw, pos, WHITESPACES)

            # Step 7, 8
            if url.endswith(","):
                # A trailing comma ends the candidate: no descriptors follow.
                url = url.rstrip(",")
                descriptors = []
            else:
                pos, descriptors = self._tokenize_descriptors(raw, pos)

            # Step 9 - 15 (descriptor parser)
            parsed = self._parse_descriptors(descriptors)
            if parsed is not None:
                width, density, height = parsed
                candidates.append(
                    {"url": url, "w": width, "x": density, "h": height}
                )

    @staticmethod
    def _tokenize_descriptors(raw, pos):
        """
        Split the descriptors that follow a URL (step 8 of the algorithm).

        Starts reading ``raw`` at ``pos`` and stops after the comma closing
        the candidate or at the end of the string. Returns a tuple
        ``(new_pos, descriptors)`` where ``descriptors`` is a list of strings.
        """
        # Step 8.e.1
        pos, _ = collect_characters_in(raw, pos, WHITESPACES)

        # Step 8.e.2, 8.e.3
        descriptors = []
        current = ""
        state = STATE_IN_DESCRIPTOR
        length = len(raw)

        # Step 8.e.4
        while True:
            # None plays the role of the spec's EOF character
            cc = raw[pos] if pos < length else None

            if state == STATE_IN_DESCRIPTOR:
                if cc is None or cc == ",":
                    if current:
                        descriptors.append(current)
                    if cc == ",":
                        # Consume the comma that closes this candidate
                        pos = pos + 1
                    break
                if cc in WHITESPACES:
                    if current:
                        descriptors.append(current)
                        current = ""
                    state = STATE_AFTER_DESCRIPTOR
                elif cc == "(":
                    current = current + cc
                    state = STATE_IN_PARENS
                else:
                    current = current + cc
            elif state == STATE_IN_PARENS:
                if cc is None:
                    # Unterminated parenthesis: keep what was collected
                    descriptors.append(current)
                    break
                current = current + cc
                if cc == ")":
                    state = STATE_IN_DESCRIPTOR
            else:  # STATE_AFTER_DESCRIPTOR
                if cc is None:
                    break
                if cc not in WHITESPACES:
                    # Re-read the same character as the start of a descriptor
                    state = STATE_IN_DESCRIPTOR
                    continue
            pos = pos + 1

        return pos, descriptors

    @classmethod
    def _parse_descriptors(cls, descriptors):
        """
        Validate the descriptors of one candidate (steps 9 - 14).

        Returns a tuple ``(width, density, height)`` of descriptor strings
        without their unit letter (None when absent), or None when the
        descriptors are invalid and the candidate has to be dropped.
        """
        width = None
        density = None
        height = None

        for descriptor in descriptors:
            value = descriptor[:-1]
            unit = descriptor[-1:]
            if unit == "w":
                if width is not None or density is not None:
                    return None
                if not cls._is_positive_integer(value):
                    return None
                width = value
            elif unit == "x":
                if not (width is None and density is None and height is None):
                    return None
                if not cls._is_valid_density(value):
                    return None
                density = value
            elif unit == "h":
                if height is not None or density is not None:
                    return None
                if not cls._is_positive_integer(value):
                    return None
                height = value
            else:
                return None

        # Step 14: a height is only allowed together with a width
        if height is not None and width is None:
            return None
        return width, density, height

    @classmethod
    def _is_positive_integer(cls, value):
        """
        Tell whether ``value`` is a run of ASCII digits that is not zero.
        """
        if cls._INTEGER_RE.match(value) is None:
            return False
        # Only zeros left nothing after stripping: the number is 0. Done on
        # the text so that no int() conversion of a huge string is needed.
        return bool(value.strip("0"))

    @classmethod
    def _is_valid_density(cls, value):
        """
        Tell whether ``value`` is a valid, finite, non-negative float.
        """
        if cls._FLOAT_RE.match(value) is None:
            return False
        number = float(value)
        # A huge exponent overflows to infinity, which is rejected as well
        return not (number < 0 or math.isinf(number))

    def stringify(self):
        """
        Returns string which is a valid srcset attribute

        Built from ``self.candidates``; when ``parse()`` has not been called
        yet it is called first instead of failing on the missing list.
        """
        if self.candidates is None:
            self.parse()

        parts = []
        for item in self.candidates:
            chunk = [item["url"]]
            for unit in ("w", "x", "h"):
                if item[unit]:
                    chunk.append("%s%s" % (item[unit], unit))
            parts.append(" ".join(chunk))
        return ", ".join(parts)


def collect_characters_in(string, start, charset):
    """
    Consume the run of characters belonging to `charset` that begins at
    `start`.

    Returns a tuple `(end, collected)`: the index of the first character that
    was not consumed and the consumed substring (empty when nothing matched).
    """
    end = start
    limit = len(string)
    while end < limit:
        if string[end] not in charset:
            break
        end += 1
    return end, string[start:end]


def collect_characters_out(string, start, charset):
    """
    Consume characters beginning at `start` up to, but not including, the
    first character that belongs to `charset` (or up to the end of `string`).

    Returns a tuple `(end, collected)`: the index where consumption stopped
    and the consumed substring (empty when `string[start]` is in `charset`).
    """
    end = start
    limit = len(string)
    while end < limit:
        if string[end] in charset:
            break
        end += 1
    return end, string[start:end]