spin_http/wagi/mod.rs
1// This file contains code copied from https://github.com/deislabs/wagi
2// The copied code's license is in this directory under LICENSE.wagi
3
4use std::{collections::HashMap, net::SocketAddr};
5
6use anyhow::Error;
7use http::{
8 header::{HeaderName, HOST},
9 request::Parts,
10 HeaderMap, HeaderValue, Response, StatusCode,
11};
12
13use crate::{body, routes::RouteMatch, Body};
14
/// This sets the version of CGI that WAGI adheres to.
///
/// `build_headers` reports this value to the guest as the `GATEWAY_INTERFACE`
/// meta-variable.
///
/// At the point at which WAGI diverges from CGI, this value will be replaced with
/// WAGI/1.0
pub const WAGI_VERSION: &str = "CGI/1.1";

/// The CGI-defined "server software version".
///
/// `build_headers` reports this value to the guest as the `SERVER_SOFTWARE`
/// meta-variable.
pub const SERVER_SOFTWARE_VERSION: &str = "WAGI/1";
23
24pub fn build_headers(
25 route_match: &RouteMatch,
26 req: &Parts,
27 content_length: usize,
28 client_addr: SocketAddr,
29 default_host: &str,
30 use_tls: bool,
31) -> HashMap<String, String> {
32 let (host, port) = parse_host_header_uri(&req.headers, &req.uri, default_host);
33 let path_info = route_match.trailing_wildcard();
34
35 let mut headers = HashMap::new();
36
37 // CGI headers from RFC
38 headers.insert("AUTH_TYPE".to_owned(), "".to_owned()); // Not currently supported
39
40 // CONTENT_LENGTH (from the spec)
41 // The server MUST set this meta-variable if and only if the request is
42 // accompanied by a message-body entity. The CONTENT_LENGTH value must
43 // reflect the length of the message-body after the server has removed
44 // any transfer-codings or content-codings.
45 headers.insert("CONTENT_LENGTH".to_owned(), format!("{}", content_length));
46
47 // CONTENT_TYPE (from the spec)
48 // The server MUST set this meta-variable if an HTTP Content-Type field is present
49 // in the client request header. If the server receives a request with an
50 // attached entity but no Content-Type header field, it MAY attempt to determine
51 // the correct content type, otherwise it should omit this meta-variable.
52 //
53 // Right now, we don't attempt to determine a media type if none is presented.
54 //
55 // The spec seems to indicate that if CONTENT_LENGTH > 0, this may be set
56 // to "application/octet-stream" if no type is otherwise set. Not sure that is
57 // a good idea.
58 headers.insert(
59 "CONTENT_TYPE".to_owned(),
60 req.headers
61 .get("CONTENT_TYPE")
62 .map(|c| c.to_str().unwrap_or(""))
63 .unwrap_or("")
64 .to_owned(),
65 );
66
67 let protocol = if use_tls { "https" } else { "http" };
68
69 // Since this is not in the specification, an X_ is prepended, per spec.
70 // NB: It is strange that there is not a way to do this already. The Display impl
71 // seems to only provide the path.
72 let uri = req.uri.clone();
73 headers.insert(
74 "X_FULL_URL".to_owned(),
75 format!(
76 "{}://{}:{}{}",
77 protocol,
78 host,
79 port,
80 uri.path_and_query().map(|pq| pq.as_str()).unwrap_or("")
81 ),
82 );
83
84 headers.insert("GATEWAY_INTERFACE".to_owned(), WAGI_VERSION.to_owned());
85
86 // This is the Wagi route. This is different from PATH_INFO in that it may
87 // have a trailing '/...'
88 headers.insert(
89 "X_MATCHED_ROUTE".to_owned(),
90 route_match.based_route().to_string(),
91 );
92
93 headers.insert(
94 "QUERY_STRING".to_owned(),
95 req.uri.query().unwrap_or("").to_owned(),
96 );
97
98 headers.insert("REMOTE_ADDR".to_owned(), client_addr.ip().to_string());
99 headers.insert("REMOTE_HOST".to_owned(), client_addr.ip().to_string()); // The server MAY substitute it with REMOTE_ADDR
100 headers.insert("REMOTE_USER".to_owned(), "".to_owned()); // TODO: Parse this out of uri.authority?
101 headers.insert("REQUEST_METHOD".to_owned(), req.method.to_string());
102
103 // The Path component is /$SCRIPT_NAME/$PATH_INFO
104 // SCRIPT_NAME is the route that matched.
105 // https://datatracker.ietf.org/doc/html/rfc3875#section-4.1.13
106 headers.insert(
107 "SCRIPT_NAME".to_owned(),
108 route_match.based_route_or_prefix().to_owned(),
109 );
110 // PATH_INFO is any path information after SCRIPT_NAME
111 //
112 // I am intentionally ignoring the PATH_INFO rule that says that a PATH_INFO
113 // cannot have a path seperator in it. If it becomes important to distinguish
114 // between what was decoded out of the path and what is encoded in the path,
115 // the X_RAW_PATH_INFO can be used.
116 //
117 // https://datatracker.ietf.org/doc/html/rfc3875#section-4.1.5
118 let pathsegment = path_info;
119 let pathinfo = percent_encoding::percent_decode_str(&pathsegment).decode_utf8_lossy();
120 headers.insert(
121 "X_RAW_PATH_INFO".to_owned(),
122 pathsegment.as_ref().to_owned(),
123 );
124 headers.insert("PATH_INFO".to_owned(), pathinfo.to_string());
125 // PATH_TRANSLATED is the url-decoded version of PATH_INFO
126 // https://datatracker.ietf.org/doc/html/rfc3875#section-4.1.6
127 headers.insert("PATH_TRANSLATED".to_owned(), pathinfo.to_string());
128
129 // From the spec: "the server would use the contents of the request's Host header
130 // field to select the correct virtual host."
131 headers.insert("SERVER_NAME".to_owned(), host);
132 headers.insert("SERVER_PORT".to_owned(), port);
133 headers.insert("SERVER_PROTOCOL".to_owned(), format!("{:?}", req.version));
134
135 headers.insert(
136 "SERVER_SOFTWARE".to_owned(),
137 SERVER_SOFTWARE_VERSION.to_owned(),
138 );
139
140 // Normalize incoming HTTP headers. The spec says:
141 // "The HTTP header field name is converted to upper case, has all
142 // occurrences of "-" replaced with "_" and has "HTTP_" prepended to
143 // give the meta-variable name."
144 req.headers.iter().for_each(|header| {
145 let key = format!(
146 "HTTP_{}",
147 header.0.as_str().to_uppercase().replace('-', "_")
148 );
149 // Per spec 4.1.18, skip some headers
150 if key == "HTTP_AUTHORIZATION" || key == "HTTP_CONNECTION" {
151 return;
152 }
153 let val = header.1.to_str().unwrap_or("CORRUPT VALUE").to_owned();
154 headers.insert(key, val);
155 });
156
157 headers
158}
159
160/// Internal utility function for parsing a host header.
161///
162/// This attempts to use three sources to construct a definitive host/port pair, ordering
163/// by precedent.
164///
165/// - The content of the host header is considered most authoritative.
166/// - Next most authoritative is self.host, which is set at the CLI or in the config
167/// - As a last resort, we use the host/port that Hyper gives us.
168/// - If none of these provide sufficient data, which is definitely a possiblity,
169/// we go with `localhost` as host and `80` as port. This, of course, is problematic,
170/// but should only manifest if both the server and the client are behaving badly.
171fn parse_host_header_uri(
172 headers: &HeaderMap,
173 uri: &hyper::Uri,
174 default_host: &str,
175) -> (String, String) {
176 let host_header = headers.get(HOST).and_then(|v| match v.to_str() {
177 Err(_) => None,
178 Ok(s) => Some(s.to_owned()),
179 });
180
181 let mut host = uri
182 .host()
183 .map(|h| h.to_string())
184 .unwrap_or_else(|| "localhost".to_owned());
185 let mut port = uri.port_u16().unwrap_or(80).to_string();
186
187 let mut parse_host = |hdr: String| {
188 let mut parts = hdr.splitn(2, ':');
189 match parts.next() {
190 Some(h) if !h.is_empty() => h.clone_into(&mut host),
191 _ => {}
192 }
193 match parts.next() {
194 Some(p) if !p.is_empty() => {
195 tracing::debug!(port = p, "Overriding port");
196 p.clone_into(&mut port);
197 }
198 _ => {}
199 }
200 };
201
202 // Override with local host field if set.
203 if !default_host.is_empty() {
204 parse_host(default_host.to_owned());
205 }
206
207 // Finally, the value of the HOST header is considered authoritative.
208 // When it comes to port number, the HOST header isn't necessarily 100% trustworthy.
209 // But it appears that this is still the best behavior for the CGI spec.
210 if let Some(hdr) = host_header {
211 parse_host(hdr);
212 }
213
214 (host, port)
215}
216
217pub fn compose_response(stdout: &[u8]) -> Result<Response<Body>, Error> {
218 // Okay, once we get here, all the information we need to send back in the response
219 // should be written to the STDOUT buffer. We fetch that, format it, and send
220 // it back. In the process, we might need to alter the status code of the result.
221 //
222 // This is a little janky, but basically we are looping through the output once,
223 // looking for the double-newline that distinguishes the headers from the body.
224 // The headers can then be parsed separately, while the body can be sent back
225 // to the client.
226 let mut last = 0;
227 let mut scan_headers = true;
228 let mut buffer: Vec<u8> = Vec::new();
229 let mut out_headers: Vec<u8> = Vec::new();
230 stdout.iter().for_each(|i| {
231 // Ignore CR in headers
232 if scan_headers && *i == 13 {
233 return;
234 } else if scan_headers && *i == 10 && last == 10 {
235 out_headers.append(&mut buffer);
236 buffer = Vec::new();
237 scan_headers = false;
238 return; // Consume the linefeed
239 }
240 last = *i;
241 buffer.push(*i)
242 });
243 let mut res = Response::new(body::full(buffer.into()));
244 let mut sufficient_response = false;
245 let mut explicit_status_code = false;
246 parse_cgi_headers(String::from_utf8(out_headers)?)
247 .iter()
248 .for_each(|h| {
249 use hyper::header::{CONTENT_TYPE, LOCATION};
250 match h.0.to_lowercase().as_str() {
251 "content-type" => {
252 sufficient_response = true;
253 res.headers_mut().insert(CONTENT_TYPE, h.1.parse().unwrap());
254 }
255 "status" => {
256 // The spec does not say that status is a sufficient response.
257 // (It says that it may be added along with Content-Type, because
258 // a status has a content type). However, CGI libraries in the wild
259 // do not set content type correctly if a status is an error.
260 // See https://datatracker.ietf.org/doc/html/rfc3875#section-6.2
261 sufficient_response = true;
262 explicit_status_code = true;
263 // Status can be `Status CODE [STRING]`, and we just want the CODE.
264 let status_code = h.1.split_once(' ').map(|(code, _)| code).unwrap_or(h.1);
265 tracing::debug!(status_code, "Raw status code");
266 match status_code.parse::<StatusCode>() {
267 Ok(code) => *res.status_mut() = code,
268 Err(e) => {
269 tracing::warn!("Failed to parse code: {}", e);
270 *res.status_mut() = StatusCode::BAD_GATEWAY;
271 }
272 }
273 }
274 "location" => {
275 sufficient_response = true;
276 res.headers_mut()
277 .insert(LOCATION, HeaderValue::from_str(h.1).unwrap());
278 if !explicit_status_code {
279 *res.status_mut() = StatusCode::from_u16(302).unwrap();
280 }
281 }
282 _ => {
283 // If the header can be parsed into a valid HTTP header, it is
284 // added to the headers. Otherwise it is ignored.
285 match HeaderName::from_lowercase(h.0.as_str().to_lowercase().as_bytes()) {
286 Ok(hdr) => {
287 res.headers_mut()
288 .insert(hdr, HeaderValue::from_str(h.1).unwrap());
289 }
290 Err(e) => {
291 tracing::error!(error = %e, header_name = %h.0, "Invalid header name")
292 }
293 }
294 }
295 }
296 });
297 if !sufficient_response {
298 tracing::debug!("{:?}", res.body());
299 return Ok(internal_error(
300 // Technically, we let `status` be sufficient, but this is more lenient
301 // than the specification.
302 "Exactly one of 'location' or 'content-type' must be specified",
303 ));
304 }
305 Ok(res)
306}
307
308fn parse_cgi_headers(headers: String) -> HashMap<String, String> {
309 let mut map = HashMap::new();
310 headers.trim().split('\n').for_each(|h| {
311 let parts: Vec<&str> = h.splitn(2, ':').collect();
312 if parts.len() != 2 {
313 tracing::warn!(header = h, "corrupt header");
314 return;
315 }
316 map.insert(parts[0].trim().to_owned(), parts[1].trim().to_owned());
317 });
318 map
319}
320
321/// Create an HTTP 500 response
322fn internal_error(msg: impl std::string::ToString) -> Response<Body> {
323 let message = msg.to_string();
324 tracing::error!(error = %message, "HTTP 500 error");
325 let mut res = Response::new(body::full(message.into()));
326 *res.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
327 res
328}