spin_http/wagi/
mod.rs

1// This file contains code copied from https://github.com/deislabs/wagi
2// The copied code's license is in this directory under LICENSE.wagi
3
4use std::{collections::HashMap, net::SocketAddr};
5
6use anyhow::Error;
7use http::{
8    header::{HeaderName, HOST},
9    request::Parts,
10    HeaderMap, HeaderValue, Response, StatusCode,
11};
12
13use crate::{body, routes::RouteMatch, Body};
14
/// This sets the version of CGI that WAGI adheres to.
///
/// `build_headers` exposes this value to the guest as the `GATEWAY_INTERFACE`
/// meta-variable.
///
/// At the point at which WAGI diverges from CGI, this value will be replaced with
/// WAGI/1.0
pub const WAGI_VERSION: &str = "CGI/1.1";

/// The CGI-defined "server software version".
///
/// `build_headers` exposes this value to the guest as the `SERVER_SOFTWARE`
/// meta-variable.
pub const SERVER_SOFTWARE_VERSION: &str = "WAGI/1";
23
24pub fn build_headers(
25    route_match: &RouteMatch,
26    req: &Parts,
27    content_length: usize,
28    client_addr: SocketAddr,
29    default_host: &str,
30    use_tls: bool,
31) -> HashMap<String, String> {
32    let (host, port) = parse_host_header_uri(&req.headers, &req.uri, default_host);
33    let path_info = route_match.trailing_wildcard();
34
35    let mut headers = HashMap::new();
36
37    // CGI headers from RFC
38    headers.insert("AUTH_TYPE".to_owned(), "".to_owned()); // Not currently supported
39
40    // CONTENT_LENGTH (from the spec)
41    // The server MUST set this meta-variable if and only if the request is
42    // accompanied by a message-body entity.  The CONTENT_LENGTH value must
43    // reflect the length of the message-body after the server has removed
44    // any transfer-codings or content-codings.
45    headers.insert("CONTENT_LENGTH".to_owned(), format!("{}", content_length));
46
47    // CONTENT_TYPE (from the spec)
48    // The server MUST set this meta-variable if an HTTP Content-Type field is present
49    // in the client request header.  If the server receives a request with an
50    // attached entity but no Content-Type header field, it MAY attempt to determine
51    // the correct content type, otherwise it should omit this meta-variable.
52    //
53    // Right now, we don't attempt to determine a media type if none is presented.
54    //
55    // The spec seems to indicate that if CONTENT_LENGTH > 0, this may be set
56    // to "application/octet-stream" if no type is otherwise set. Not sure that is
57    // a good idea.
58    headers.insert(
59        "CONTENT_TYPE".to_owned(),
60        req.headers
61            .get("CONTENT_TYPE")
62            .map(|c| c.to_str().unwrap_or(""))
63            .unwrap_or("")
64            .to_owned(),
65    );
66
67    let protocol = if use_tls { "https" } else { "http" };
68
69    // Since this is not in the specification, an X_ is prepended, per spec.
70    // NB: It is strange that there is not a way to do this already. The Display impl
71    // seems to only provide the path.
72    let uri = req.uri.clone();
73    headers.insert(
74        "X_FULL_URL".to_owned(),
75        format!(
76            "{}://{}:{}{}",
77            protocol,
78            host,
79            port,
80            uri.path_and_query().map(|pq| pq.as_str()).unwrap_or("")
81        ),
82    );
83
84    headers.insert("GATEWAY_INTERFACE".to_owned(), WAGI_VERSION.to_owned());
85
86    // This is the Wagi route. This is different from PATH_INFO in that it may
87    // have a trailing '/...'
88    headers.insert(
89        "X_MATCHED_ROUTE".to_owned(),
90        route_match.based_route().to_string(),
91    );
92
93    headers.insert(
94        "QUERY_STRING".to_owned(),
95        req.uri.query().unwrap_or("").to_owned(),
96    );
97
98    headers.insert("REMOTE_ADDR".to_owned(), client_addr.ip().to_string());
99    headers.insert("REMOTE_HOST".to_owned(), client_addr.ip().to_string()); // The server MAY substitute it with REMOTE_ADDR
100    headers.insert("REMOTE_USER".to_owned(), "".to_owned()); // TODO: Parse this out of uri.authority?
101    headers.insert("REQUEST_METHOD".to_owned(), req.method.to_string());
102
103    // The Path component is /$SCRIPT_NAME/$PATH_INFO
104    // SCRIPT_NAME is the route that matched.
105    // https://datatracker.ietf.org/doc/html/rfc3875#section-4.1.13
106    headers.insert(
107        "SCRIPT_NAME".to_owned(),
108        route_match.based_route_or_prefix().to_owned(),
109    );
110    // PATH_INFO is any path information after SCRIPT_NAME
111    //
112    // I am intentionally ignoring the PATH_INFO rule that says that a PATH_INFO
113    // cannot have a path seperator in it. If it becomes important to distinguish
114    // between what was decoded out of the path and what is encoded in the path,
115    // the X_RAW_PATH_INFO can be used.
116    //
117    // https://datatracker.ietf.org/doc/html/rfc3875#section-4.1.5
118    let pathsegment = path_info;
119    let pathinfo = percent_encoding::percent_decode_str(&pathsegment).decode_utf8_lossy();
120    headers.insert(
121        "X_RAW_PATH_INFO".to_owned(),
122        pathsegment.as_ref().to_owned(),
123    );
124    headers.insert("PATH_INFO".to_owned(), pathinfo.to_string());
125    // PATH_TRANSLATED is the url-decoded version of PATH_INFO
126    // https://datatracker.ietf.org/doc/html/rfc3875#section-4.1.6
127    headers.insert("PATH_TRANSLATED".to_owned(), pathinfo.to_string());
128
129    // From the spec: "the server would use the contents of the request's Host header
130    // field to select the correct virtual host."
131    headers.insert("SERVER_NAME".to_owned(), host);
132    headers.insert("SERVER_PORT".to_owned(), port);
133    headers.insert("SERVER_PROTOCOL".to_owned(), format!("{:?}", req.version));
134
135    headers.insert(
136        "SERVER_SOFTWARE".to_owned(),
137        SERVER_SOFTWARE_VERSION.to_owned(),
138    );
139
140    // Normalize incoming HTTP headers. The spec says:
141    // "The HTTP header field name is converted to upper case, has all
142    // occurrences of "-" replaced with "_" and has "HTTP_" prepended to
143    // give the meta-variable name."
144    req.headers.iter().for_each(|header| {
145        let key = format!(
146            "HTTP_{}",
147            header.0.as_str().to_uppercase().replace('-', "_")
148        );
149        // Per spec 4.1.18, skip some headers
150        if key == "HTTP_AUTHORIZATION" || key == "HTTP_CONNECTION" {
151            return;
152        }
153        let val = header.1.to_str().unwrap_or("CORRUPT VALUE").to_owned();
154        headers.insert(key, val);
155    });
156
157    headers
158}
159
160/// Internal utility function for parsing a host header.
161///
162/// This attempts to use three sources to construct a definitive host/port pair, ordering
163/// by precedent.
164///
165/// - The content of the host header is considered most authoritative.
166/// - Next most authoritative is self.host, which is set at the CLI or in the config
167/// - As a last resort, we use the host/port that Hyper gives us.
168/// - If none of these provide sufficient data, which is definitely a possiblity,
169///   we go with `localhost` as host and `80` as port. This, of course, is problematic,
170///   but should only manifest if both the server and the client are behaving badly.
171fn parse_host_header_uri(
172    headers: &HeaderMap,
173    uri: &hyper::Uri,
174    default_host: &str,
175) -> (String, String) {
176    let host_header = headers.get(HOST).and_then(|v| match v.to_str() {
177        Err(_) => None,
178        Ok(s) => Some(s.to_owned()),
179    });
180
181    let mut host = uri
182        .host()
183        .map(|h| h.to_string())
184        .unwrap_or_else(|| "localhost".to_owned());
185    let mut port = uri.port_u16().unwrap_or(80).to_string();
186
187    let mut parse_host = |hdr: String| {
188        let mut parts = hdr.splitn(2, ':');
189        match parts.next() {
190            Some(h) if !h.is_empty() => h.clone_into(&mut host),
191            _ => {}
192        }
193        match parts.next() {
194            Some(p) if !p.is_empty() => {
195                tracing::debug!(port = p, "Overriding port");
196                p.clone_into(&mut port);
197            }
198            _ => {}
199        }
200    };
201
202    // Override with local host field if set.
203    if !default_host.is_empty() {
204        parse_host(default_host.to_owned());
205    }
206
207    // Finally, the value of the HOST header is considered authoritative.
208    // When it comes to port number, the HOST header isn't necessarily 100% trustworthy.
209    // But it appears that this is still the best behavior for the CGI spec.
210    if let Some(hdr) = host_header {
211        parse_host(hdr);
212    }
213
214    (host, port)
215}
216
217pub fn compose_response(stdout: &[u8]) -> Result<Response<Body>, Error> {
218    // Okay, once we get here, all the information we need to send back in the response
219    // should be written to the STDOUT buffer. We fetch that, format it, and send
220    // it back. In the process, we might need to alter the status code of the result.
221    //
222    // This is a little janky, but basically we are looping through the output once,
223    // looking for the double-newline that distinguishes the headers from the body.
224    // The headers can then be parsed separately, while the body can be sent back
225    // to the client.
226    let mut last = 0;
227    let mut scan_headers = true;
228    let mut buffer: Vec<u8> = Vec::new();
229    let mut out_headers: Vec<u8> = Vec::new();
230    stdout.iter().for_each(|i| {
231        // Ignore CR in headers
232        if scan_headers && *i == 13 {
233            return;
234        } else if scan_headers && *i == 10 && last == 10 {
235            out_headers.append(&mut buffer);
236            buffer = Vec::new();
237            scan_headers = false;
238            return; // Consume the linefeed
239        }
240        last = *i;
241        buffer.push(*i)
242    });
243    let mut res = Response::new(body::full(buffer.into()));
244    let mut sufficient_response = false;
245    let mut explicit_status_code = false;
246    parse_cgi_headers(String::from_utf8(out_headers)?)
247        .iter()
248        .for_each(|h| {
249            use hyper::header::{CONTENT_TYPE, LOCATION};
250            match h.0.to_lowercase().as_str() {
251                "content-type" => {
252                    sufficient_response = true;
253                    res.headers_mut().insert(CONTENT_TYPE, h.1.parse().unwrap());
254                }
255                "status" => {
256                    // The spec does not say that status is a sufficient response.
257                    // (It says that it may be added along with Content-Type, because
258                    // a status has a content type). However, CGI libraries in the wild
259                    // do not set content type correctly if a status is an error.
260                    // See https://datatracker.ietf.org/doc/html/rfc3875#section-6.2
261                    sufficient_response = true;
262                    explicit_status_code = true;
263                    // Status can be `Status CODE [STRING]`, and we just want the CODE.
264                    let status_code = h.1.split_once(' ').map(|(code, _)| code).unwrap_or(h.1);
265                    tracing::debug!(status_code, "Raw status code");
266                    match status_code.parse::<StatusCode>() {
267                        Ok(code) => *res.status_mut() = code,
268                        Err(e) => {
269                            tracing::warn!("Failed to parse code: {}", e);
270                            *res.status_mut() = StatusCode::BAD_GATEWAY;
271                        }
272                    }
273                }
274                "location" => {
275                    sufficient_response = true;
276                    res.headers_mut()
277                        .insert(LOCATION, HeaderValue::from_str(h.1).unwrap());
278                    if !explicit_status_code {
279                        *res.status_mut() = StatusCode::from_u16(302).unwrap();
280                    }
281                }
282                _ => {
283                    // If the header can be parsed into a valid HTTP header, it is
284                    // added to the headers. Otherwise it is ignored.
285                    match HeaderName::from_lowercase(h.0.as_str().to_lowercase().as_bytes()) {
286                        Ok(hdr) => {
287                            res.headers_mut()
288                                .insert(hdr, HeaderValue::from_str(h.1).unwrap());
289                        }
290                        Err(e) => {
291                            tracing::error!(error = %e, header_name = %h.0, "Invalid header name")
292                        }
293                    }
294                }
295            }
296        });
297    if !sufficient_response {
298        tracing::debug!("{:?}", res.body());
299        return Ok(internal_error(
300            // Technically, we let `status` be sufficient, but this is more lenient
301            // than the specification.
302            "Exactly one of 'location' or 'content-type' must be specified",
303        ));
304    }
305    Ok(res)
306}
307
308fn parse_cgi_headers(headers: String) -> HashMap<String, String> {
309    let mut map = HashMap::new();
310    headers.trim().split('\n').for_each(|h| {
311        let parts: Vec<&str> = h.splitn(2, ':').collect();
312        if parts.len() != 2 {
313            tracing::warn!(header = h, "corrupt header");
314            return;
315        }
316        map.insert(parts[0].trim().to_owned(), parts[1].trim().to_owned());
317    });
318    map
319}
320
321/// Create an HTTP 500 response
322fn internal_error(msg: impl std::string::ToString) -> Response<Body> {
323    let message = msg.to_string();
324    tracing::error!(error = %message, "HTTP 500 error");
325    let mut res = Response::new(body::full(message.into()));
326    *res.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
327    res
328}