Re: PATCH: Add uri percent-encoding for binary data

From: Bruce Momjian <bruce(at)momjian(dot)us>
To: Anders Åstrand <anders(at)449(dot)se>
Cc: pgsql-hackers(at)postgresql(dot)org
Subject: Re: PATCH: Add uri percent-encoding for binary data
Date: 2019-10-07 19:52:41
Message-ID: 20191007195241.GJ4732@momjian.us
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers

On Mon, Oct 7, 2019 at 09:14:38AM +0200, Anders Åstrand wrote:
> Hello
>
> Attached is a patch for adding uri as an encoding option for
> encode/decode. It uses what's called "percent-encoding" in RFC 3986
> (https://tools.ietf.org/html/rfc3986#section-2.1).

Oh, that's a cool idea. Can you add it to the commit-fest?

https://commitfest.postgresql.org/25/

---------------------------------------------------------------------------

>
> The background for this patch is that I could easily build urls in
> plpgsql, but doing the actual encoding of the url parts is painfully
> slow. The list of available encodings for encode/decode looks quite
> arbitrary to me, so I can't see any reason this one couldn't be in
> there.
>
> In modern web scenarios one would most likely want to encode
> the UTF-8 representation of a text string for inclusion in a URL, in
> which case the correct invocation would be ENCODE(CONVERT_TO('some text in
> database encoding goes here', 'UTF8'), 'uri'), but URI
> percent-encoding can of course also be used for other text encodings
> and arbitrary binary data.
>
> Regards,
> Anders

> diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
> index 7293d66de5..33cf7bb57c 100644
> --- a/src/backend/utils/adt/encode.c
> +++ b/src/backend/utils/adt/encode.c
> @@ -512,6 +512,131 @@ esc_dec_len(const char *src, unsigned srclen)
> return len;
> }
>
> +/*
> + * URI percent encoding
> + *
> + * Percent encodes all byte values except the unreserved ASCII characters as per RFC3986.
> + */
> +
> +static const char upper_hex_digits[] = "0123456789ABCDEF";
> +
> +static unsigned
> +uri_encode(const char *src, unsigned srclen, char *dst)
> +{
> + char *d = dst;
> +
> + for (const char *s = src; s < src + srclen; s++)
> + {
> + if ((*s >= 'A' && *s <= 'Z') ||
> + (*s >= 'a' && *s <= 'z') ||
> + (*s >= '0' && *s <= '9') ||
> + *s == '-' ||
> + *s == '_' ||
> + *s == '.' ||
> + *s == '~')
> + {
> + *d++ = *s;
> + }
> + else
> + {
> + *d++ = '%';
> + *d++ = upper_hex_digits[(*s >> 4) & 0xF];
> + *d++ = upper_hex_digits[*s & 0xF];
> + }
> + }
> + return d - dst;
> +}
> +
> +static unsigned
> +uri_decode(const char *src, unsigned srclen, char *dst)
> +{
> + const char *s = src;
> + const char *srcend = src + srclen;
> + char *d = dst;
> + char val;
> +
> + while (s < srcend)
> + {
> + if (*s == '%')
> + {
> + if (s > srcend - 3) {
> + /* This will never get triggered since uri_dec_len already takes care of validation
> + */
> + ereport(ERROR,
> + (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> + errmsg("invalid uri percent encoding"),
> + errhint("Input data ends prematurely.")));
> + }
> +
> + /* Skip '%' */
> + s++;
> +
> + val = get_hex(*s++) << 4;
> + val += get_hex(*s++);
> + *d++ = val;
> + }
> + else
> + {
> + *d++ = *s++;
> + }
> + }
> + return d - dst;
> +}
> +
> +static unsigned
> +uri_enc_len(const char *src, unsigned srclen)
> +{
> + int len = 0;
> +
> + for (const char *s = src; s < src + srclen; s++)
> + {
> + if ((*s >= 'A' && *s <= 'Z') ||
> + (*s >= 'a' && *s <= 'z') ||
> + (*s >= '0' && *s <= '9') ||
> + *s == '-' ||
> + *s == '_' ||
> + *s == '.' ||
> + *s == '~')
> + {
> + len++;
> + }
> + else
> + {
> + len += 3;
> + }
> + }
> + return len;
> +}
> +
> +static unsigned
> +uri_dec_len(const char *src, unsigned srclen)
> +{
> + const char *s = src;
> + const char *srcend = src + srclen;
> + int len = 0;
> +
> + while (s < srcend)
> + {
> + if (*s == '%')
> + {
> + if (s > srcend - 3) {
> + ereport(ERROR,
> + (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> + errmsg("invalid uri percent encoding"),
> + errhint("Input data ends prematurely.")));
> + }
> + s++;
> + get_hex(*s++);
> + get_hex(*s++);
> + }
> + else {
> + s++;
> + }
> + len++;
> + }
> + return len;
> +}
> +
> /*
> * Common
> */
> @@ -541,6 +666,12 @@ static const struct
> esc_enc_len, esc_dec_len, esc_encode, esc_decode
> }
> },
> + {
> + "uri",
> + {
> + uri_enc_len, uri_dec_len, uri_encode, uri_decode
> + }
> + },
> {
> NULL,
> {
> diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
> index 2483966576..f89c5ec1c3 100644
> --- a/src/test/regress/expected/strings.out
> +++ b/src/test/regress/expected/strings.out
> @@ -1870,3 +1870,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
> Th\000o\x02\x03
> (1 row)
>
> +SET bytea_output TO hex;
> +SELECT encode(E'en\\300\\336d'::bytea, 'uri');
> + encode
> +-----------
> + en%C0%DEd
> +(1 row)
> +
> +SELECT decode('%De%c0%DEd', 'uri');
> + decode
> +------------
> + \xdec0de64
> +(1 row)
> +
> +SELECT decode('error%Ex', 'uri');
> +ERROR: invalid hexadecimal digit: "x"
> +SELECT decode('error%E', 'uri');
> +ERROR: invalid uri percent encoding
> +HINT: Input data ends prematurely.
> +SELECT decode('error%', 'uri');
> +ERROR: invalid uri percent encoding
> +HINT: Input data ends prematurely.
> diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
> index b5e75c344f..1d03836b6e 100644
> --- a/src/test/regress/sql/strings.sql
> +++ b/src/test/regress/sql/strings.sql
> @@ -641,3 +641,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
> SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
> SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
> SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
> +
> +SET bytea_output TO hex;
> +SELECT encode(E'en\\300\\336d'::bytea, 'uri');
> +SELECT decode('%De%c0%DEd', 'uri');
> +SELECT decode('error%Ex', 'uri');
> +SELECT decode('error%E', 'uri');
> +SELECT decode('error%', 'uri');

--
Bruce Momjian <bruce(at)momjian(dot)us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +

In response to

Responses

Browse pgsql-hackers by date

  From Date Subject
Next Message Robert Haas 2019-10-07 19:55:58 Re: Missed check for too-many-children in bgworker spawning
Previous Message Robert Haas 2019-10-07 19:50:06 Re: Transparent Data Encryption (TDE) and encrypted files