From 5e31f66313e2dc15fb9e75395504feb0992c3feb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaakko=20Kera=CC=88nen?= <jaakko.keranen@iki.fi>
Date: Mon, 27 Dec 2021 10:00:17 +0200
Subject: [PATCH 1/1] Reserved characters in URLs
Making URL encoding a little less convoluted. Now when sending out a request, the URL is fully encoded except for reserved characters. In the internal representation, non-ASCII characters are in decoded form (i.e., IRI).
This means that if the user enters a URL in the input field manually, its non-ASCII characters will be percent encoded as well. However, in this case the user is expected to manually escape all reserved characters because the input field can't tell the difference between what is intended to be a reserved separator and what isn't. For example, a server might expect &-separated fields, and if the user enters such fields manually in the URL field, they shouldn't be converted to %26.
When forming a query URL in the input dialog, user-entered text is fully percent-encoded because in that case the input is just a generic text string.
IssueID #410
src/app.c | 6 ------
src/gmrequest.c | 6 ++++--
src/gmutil.c | 30 ++++++++++++++++++++++++++----
src/gmutil.h | 2 ++
src/ui/inputwidget.c | 10 ++++++++--
5 files changed, 40 insertions(+), 14 deletions(-)
diff --git a/src/app.c b/src/app.c
index 3b4c24f0..73cc35ee 100644
--- a/src/app.c
+++ b/src/app.c
@@ -2791,12 +2791,6 @@ iBool handleCommand_App(const char *cmd) {
setRedirectCount_DocumentWidget(doc, redirectCount);
setOrigin_DocumentWidget(doc, origin);
showCollapsed_Widget(findWidget_App("document.progress"), iFalse);
-if (prefs_App()->decodeUserVisibleURLs) {
-urlDecodePath_String(url);
-}
-else {
-urlEncodePath_String(url);
-}
setUrlFlags_DocumentWidget(doc, url,
isHistory ? useCachedContentIfAvailable_DocumentWidgetSetUrlFlag : 0);
/* Optionally, jump to a text in the document. This will only work if the document
diff --git a/src/gmrequest.c b/src/gmrequest.c
index a9c5919d..c23e8499 100644
--- a/src/gmrequest.c
+++ b/src/gmrequest.c
@@ -585,8 +585,10 @@ void setUrl_GmRequest(iGmRequest *d, const iString *url) {
/* TODO: Gemini spec allows UTF-8 encoded URLs, but still need to percent-encode non-ASCII
characters? Could be a server-side issue, e.g., if they're using a URL parser meant for
the web. */
d->identity = identityForUrl_GmCerts(d->certs, &d->url);
}
diff --git a/src/gmutil.c b/src/gmutil.c
index 79462e41..98e4d4d6 100644
--- a/src/gmutil.c
+++ b/src/gmutil.c
@@ -330,6 +330,28 @@ void urlEncodePath_String(iString *d) {
delete_String(encoded);
}
+void urlEncodeQuery_String(iString *d) { /* As shown in this copy, the function is a no-op: it returns immediately and never touches `d`. */
return; /* NOTE(review): the hunk header for this region (-330,6 +330,28) announces 22 added lines, so the real body appears to have been lost in transit -- confirm against the upstream commit before relying on this stub. */
+}
iBool isKnownScheme_Rangecc(iRangecc scheme) {
if (isKnownUrlScheme_Rangecc(scheme)) {
return iTrue;
@@ -667,20 +689,20 @@ const iString *canonicalUrl_String(const iString *d) {
iString *canon = NULL;
iUrl parts;
init_Url(&parts, d);
if (iStrStrN(parts.path.start, "%3A", size_Range(&parts.path)) ||
iStrStrN(parts.path.start, "%3a", size_Range(&parts.path))) {
/* This is done separately to avoid the copy if %3A is not present; it's rare. */
canon = copy_String(d);
urlDecodePath_String(canon);
-iString *dec = maybeUrlDecodeExclude_String(canon, "%/?:;#&+= "); /* decode everything else in all parts */
+iString *dec = maybeUrlDecodeExclude_String(canon, "% " URL_RESERVED_CHARS); /* decode everything else in all parts */
if (dec) {
set_String(canon, dec);
delete_String(dec);
}
}
else {
-canon = maybeUrlDecodeExclude_String(d, "%/?:;#&+= ");
+canon = maybeUrlDecodeExclude_String(d, "% " URL_RESERVED_CHARS);
}
/* `canon` may now be NULL if nothing was decoded. */
if (indexOfCStr_String(canon ? canon : d, " ") != iInvalidPos ||
@@ -689,7 +711,7 @@ const iString *canonicalUrl_String(const iString *d) {
canon = copy_String(d);
}
urlEncodeSpaces_String(canon);
return canon ? collect_String(canon) : d;
}
diff --git a/src/gmutil.h b/src/gmutil.h
index 6d337eeb..15bb7b2e 100644
--- a/src/gmutil.h
+++ b/src/gmutil.h
@@ -100,6 +100,7 @@ iRegExp * newGemtextLink_RegExp (void);
#define GEMINI_DEFAULT_PORT ((uint16_t) 1965)
#define GEMINI_DEFAULT_PORT_CSTR "1965"
+#define URL_RESERVED_CHARS ":/?#[]@!$&'()*+,;=" /* RFC 3986 */
struct Impl_Url {
iRangecc scheme;
@@ -131,6 +132,7 @@ const iString * urlFragmentStripped_String(const iString *);
const iString * urlQueryStripped_String (const iString *);
void urlDecodePath_String (iString *);
void urlEncodePath_String (iString *);
+void urlEncodeQuery_String (iString *);
iString * makeFileUrl_String (const iString *localFilePath);
const char * makeFileUrl_CStr (const char *localFilePath);
iString * localFilePathFromUrl_String(const iString *);
diff --git a/src/ui/inputwidget.c b/src/ui/inputwidget.c
index b94e0c27..24983d69 100644
--- a/src/ui/inputwidget.c
+++ b/src/ui/inputwidget.c
@@ -1100,9 +1100,15 @@ static void updateBuffered_InputWidget_(iInputWidget *d) {
void setText_InputWidget(iInputWidget *d, const iString *text) {
if (!d) return;
if (d->inFlags & isUrl_InputWidgetFlag) {
/* If user wants URLs encoded, also Punycode the domain. */
if (!prefs_App()->decodeUserVisibleURLs) {
if (prefs_App()->decodeUserVisibleURLs) {
iString *enc = collect_String(copy_String(text));
urlDecodePath_String(enc);
text = enc;
}
else {
/* The user wants URLs encoded, also Punycode the domain. */
iString *enc = collect_String(copy_String(text));
urlEncodePath_String(enc);
/* Prevent address bar spoofing (mentioned as IDN homograph attack in
https://github.com/skyjake/lagrange/issues/73) */
punyEncodeUrlHost_String(enc);
--
2.25.1
text/plain
This content has been proxied by September (ba2dc).