// Copyright (c) Wikimedia Foundation and contributors. // All Rights Reserved. // // This file is part of GitLab Content Proxy. // // GitLab Content Proxy is free software: you can redistribute it and/or // modify it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or (at your // option) any later version. // // GitLab Content Proxy is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for // more details. // // You should have received a copy of the GNU General Public License along // with this program. If not, see . package main import ( "bytes" "fmt" slogenv "github.com/cbrewster/slog-env" "io" "log" "log/slog" "net/http" "net/http/httputil" "net/url" "os" "regexp" "slices" "strconv" "strings" ) // Root URL of upstream gitlab service const UPSTREAM_URL = "https://gitlab.wikimedia.org" // User-Agent to present to UPSTREAM_URL when acting as proxy const USER_AGENT = "gitlab-content (https://wikitech.wikimedia.org/wiki/Tool:Gitlab-content; gitlab-content.maintainers@toolforge.org)" // Safe characters for an HTTP header payload const RE_HEADER_SAFE = "[^A-Za-z0-9\\-_.~!#$&\\'()*+,/:;=?@\\[\\] ]" // Body for our synthetic 404 Not Found responses. const BODY_NOTFOUND = "Upstream url not found.\n" // Body for 400 Bad Request responses. const BODY_BADREQUEST = "/-/raw/ not found in URL.\n" // Format string for Cache-Control header. // Arguments: // - 1: max time to reuse a cached response in seconds const FMT_CACHECONTROL = "public, max-age=%d, must-revalidate" // Index page for app. Expects to be expanded with fmt.Sprintf. // Arguments: // - 1: fully qualified hostname (and port if not :443) const INDEX_HTML = ` Wikimedia GitLab Content Proxy

Wikimedia GitLab Content Proxy

A reverse proxy to serve gitlab.wikimedia.org files with appropriate mime types so they can be imported by mw.loader.load.

Usage

https://%[1]s/{REPO}/-/raw/{PATH}?mime={MIME}&maxage={MAXAGE}

{REPO}
gitlab.wikimedia.org repo name
{PATH}
path to file within the repo including branch name or commit hash
{MIME}
mime type to apply to the content (default: text/plain)
{MAXAGE}
Set the max-age HTTP cache control header to this many seconds. Errors are never cached. (optional)

The URL format matches gitlab's internal URLs for raw content. This should help folks find the right URL by browsing in gitlab.wikimedia.org, changing the hostname to gitlab-content.toolforge.org, and adding the desired mime type specification.

Example: https://%[1]s/toolforge-repos/gitlab-content/-/raw/main/main.go?mime=text/plain;+charset=utf-8&maxage=86400

` func main() { // Output structured logs to stderr as JSON lines. logger := slog.New(slogenv.NewHandler(slog.NewJSONHandler(os.Stderr, nil))) slog.SetDefault(logger) slog.Info("Creating reverse proxy", "upstream", UPSTREAM_URL) upstream, err := url.Parse(UPSTREAM_URL) if err != nil { log.Fatal("Invalid origin server URL") } proxy := httputil.NewSingleHostReverseProxy(upstream) // Use proxy.Director to modify upstream requests. // - Replace User-Agent header with our own origDirector := proxy.Director proxy.Director = func(req *http.Request) { origDirector(req) req.Header.Set("User-Agent", USER_AGENT) // Drop Accept-Encoding header so the upstream doesn't compress its // response. The Toolforge reverse proxy adds gzip automatically based // on mime-type and double encoding makes everyone sad. req.Header.Del("Accept-Encoding") } headerSafe := regexp.MustCompile(RE_HEADER_SAFE) allowedResponseCodes := []int{ 200, 304, 403, 404, 408, 410, 412, 413, 414, 415, 416, 417, 421, 429, 431, 451, 500, 501, 502, 503, 504, } allowedHeaders := []string{ "Cache-Control", "Content-Disposition", "Content-Length", "Content-Location", "Content-Type", "Date", "Etag", "Expires", "Retry-After", "Transfer-Encoding", "Vary", } // Use proxy.ModifyResponse to modify response to client. // - Turn unexpected status codes into 404s // - Add user supplied Content-Type or default "text/plain" proxy.ModifyResponse = func(resp *http.Response) error { req := resp.Request slog.Debug( "Upstream response", "url", req.URL.String(), "status", resp.StatusCode, ) if !slices.Contains(allowedResponseCodes, resp.StatusCode) { // We got some upstream response that isn't content or something // that we feel like we can just hand back to the caller. // Return a 404 because the actual code may be a 301 to some // login screen or something. resp.StatusCode = http.StatusNotFound resp.Status = http.StatusText(http.StatusNotFound) resp.Body = io.NopCloser(bytes.NewBufferString(BODY_NOTFOUND)) resp.ContentLength = int64(len(BODY_NOTFOUND)) resp.Header = make(http.Header, 0) resp.Header.Set("Content-Length", strconv.Itoa(len(BODY_NOTFOUND))) resp.Header.Set("Content-Type", "text/plain; charset=utf-8") resp.Header.Set("Cache-Control", "no-store") slog.Debug("Not Found", "url", req.URL.String()) return nil } // Pass through headers in the allow list respHeaders := resp.Header resp.Header = make(http.Header, 0) for _, header := range allowedHeaders { for _, value := range respHeaders.Values(header) { resp.Header.Add(header, value) } } if resp.StatusCode == 200 { // Change Content-Type header for found content mime := "text/plain; charset=utf-8" if req.URL.Query().Get("mime") != "" { mime = headerSafe.ReplaceAllString( req.URL.Query().Get("mime"), "", ) } resp.Header.Set("Content-Type", mime) // Change Cache-Control header if requested if req.URL.Query().Get("maxage") != "" { maxage, err := strconv.Atoi(req.URL.Query().Get("maxage")) if err == nil { resp.Header.Set( "Cache-Control", fmt.Sprintf(FMT_CACHECONTROL, maxage), ) } } } slog.Debug( "Request", "url", req.URL.String(), "status", resp.StatusCode, ) return nil } // Serve INDEX_HTML static page for GET / route http.HandleFunc("GET /{$}", func(w http.ResponseWriter, r *http.Request) { slog.Debug("Request", "url", r.URL.String()) w.WriteHeader(http.StatusOK) w.Header().Set("Content-Type", "text/html; charset=utf-8") w.Header().Set("Cache-Control", "public, max-age=3600, must-revalidate") _, err := io.WriteString(w, fmt.Sprintf(INDEX_HTML, r.Host)) if err != nil { log.Fatal(err) } }) // Handle all unhandled GET requests. // - Reverse proxy to UPSTREAM_URL if path contains `/-/raw/` // - Return 400 Bad Request otherwise http.HandleFunc("GET /", func(w http.ResponseWriter, r *http.Request) { if strings.Contains(r.URL.Path, "/-/raw/") { proxy.ServeHTTP(w, r) } else { slog.Debug("Bad request", "url", r.URL.String()) w.WriteHeader(http.StatusBadRequest) w.Header().Set("Content-Type", "text/plain; charset=utf-8") _, err := io.WriteString(w, BODY_BADREQUEST) if err != nil { log.Fatal(err) } } }) // Listen for HTTP requests on port 8000 log.Fatal(http.ListenAndServe(":8000", nil)) }