spin_http/wagi/mod.rs
1// This file contains code copied from https://github.com/deislabs/wagi
2// The copied code's license is in this directory under LICENSE.wagi
3
4use std::{collections::HashMap, net::SocketAddr};
5
6use anyhow::Error;
7use http::{
8 HeaderMap, HeaderValue, Response, StatusCode,
9 header::{HOST, HeaderName},
10 request::Parts,
11};
12
13use crate::{Body, body, routes::RouteMatch};
14
/// This sets the version of CGI that WAGI adheres to. It is the value reported to the
/// guest in the `GATEWAY_INTERFACE` meta-variable.
///
/// At the point at which WAGI diverges from CGI, this value will be replaced with
/// WAGI/1.0
pub const WAGI_VERSION: &str = "CGI/1.1";

/// The CGI-defined "server software version". It is the value reported to the guest
/// in the `SERVER_SOFTWARE` meta-variable.
pub const SERVER_SOFTWARE_VERSION: &str = "WAGI/1";
23
24pub fn build_headers(
25 route_match: &RouteMatch,
26 req: &Parts,
27 content_length: usize,
28 client_addr: SocketAddr,
29 default_host: &str,
30 use_tls: bool,
31) -> HashMap<String, String> {
32 let (host, port) = parse_host_header_uri(&req.headers, &req.uri, default_host);
33 let path_info = route_match.trailing_wildcard();
34
35 let mut headers = HashMap::new();
36
37 // CGI headers from RFC
38 headers.insert("AUTH_TYPE".to_owned(), "".to_owned()); // Not currently supported
39
40 // CONTENT_LENGTH (from the spec)
41 // The server MUST set this meta-variable if and only if the request is
42 // accompanied by a message-body entity. The CONTENT_LENGTH value must
43 // reflect the length of the message-body after the server has removed
44 // any transfer-codings or content-codings.
45 headers.insert("CONTENT_LENGTH".to_owned(), format!("{content_length}"));
46
47 // CONTENT_TYPE (from the spec)
48 // The server MUST set this meta-variable if an HTTP Content-Type field is present
49 // in the client request header. If the server receives a request with an
50 // attached entity but no Content-Type header field, it MAY attempt to determine
51 // the correct content type, otherwise it should omit this meta-variable.
52 //
53 // Right now, we don't attempt to determine a media type if none is presented.
54 //
55 // The spec seems to indicate that if CONTENT_LENGTH > 0, this may be set
56 // to "application/octet-stream" if no type is otherwise set. Not sure that is
57 // a good idea.
58 headers.insert(
59 "CONTENT_TYPE".to_owned(),
60 req.headers
61 .get("CONTENT_TYPE")
62 .map(|c| c.to_str().unwrap_or(""))
63 .unwrap_or("")
64 .to_owned(),
65 );
66
67 let protocol = if use_tls { "https" } else { "http" };
68
69 // Since this is not in the specification, an X_ is prepended, per spec.
70 // NB: It is strange that there is not a way to do this already. The Display impl
71 // seems to only provide the path.
72 let uri = req.uri.clone();
73 headers.insert(
74 "X_FULL_URL".to_owned(),
75 format!(
76 "{}://{}:{}{}",
77 protocol,
78 host,
79 port,
80 uri.path_and_query().map(|pq| pq.as_str()).unwrap_or("")
81 ),
82 );
83
84 headers.insert("GATEWAY_INTERFACE".to_owned(), WAGI_VERSION.to_owned());
85
86 // This is the Wagi route. This is different from PATH_INFO in that it may
87 // have a trailing '/...'
88 headers.insert(
89 "X_MATCHED_ROUTE".to_owned(),
90 route_match.based_route().to_string(),
91 );
92
93 headers.insert(
94 "QUERY_STRING".to_owned(),
95 req.uri.query().unwrap_or("").to_owned(),
96 );
97
98 headers.insert("REMOTE_ADDR".to_owned(), client_addr.ip().to_string());
99 headers.insert("REMOTE_HOST".to_owned(), client_addr.ip().to_string()); // The server MAY substitute it with REMOTE_ADDR
100 headers.insert("REMOTE_USER".to_owned(), "".to_owned()); // TODO: Parse this out of uri.authority?
101 headers.insert("REQUEST_METHOD".to_owned(), req.method.to_string());
102
103 // The Path component is /$SCRIPT_NAME/$PATH_INFO
104 // SCRIPT_NAME is the route that matched.
105 // https://datatracker.ietf.org/doc/html/rfc3875#section-4.1.13
106 headers.insert(
107 "SCRIPT_NAME".to_owned(),
108 route_match.based_route_or_prefix().to_owned(),
109 );
110 // PATH_INFO is any path information after SCRIPT_NAME
111 //
112 // I am intentionally ignoring the PATH_INFO rule that says that a PATH_INFO
113 // cannot have a path seperator in it. If it becomes important to distinguish
114 // between what was decoded out of the path and what is encoded in the path,
115 // the X_RAW_PATH_INFO can be used.
116 //
117 // https://datatracker.ietf.org/doc/html/rfc3875#section-4.1.5
118 let pathsegment = path_info;
119 let pathinfo = percent_encoding::percent_decode_str(&pathsegment).decode_utf8_lossy();
120 headers.insert(
121 "X_RAW_PATH_INFO".to_owned(),
122 pathsegment.as_ref().to_owned(),
123 );
124 headers.insert("PATH_INFO".to_owned(), pathinfo.to_string());
125 // PATH_TRANSLATED is the url-decoded version of PATH_INFO
126 // https://datatracker.ietf.org/doc/html/rfc3875#section-4.1.6
127 headers.insert("PATH_TRANSLATED".to_owned(), pathinfo.to_string());
128
129 // From the spec: "the server would use the contents of the request's Host header
130 // field to select the correct virtual host."
131 headers.insert("SERVER_NAME".to_owned(), host);
132 headers.insert("SERVER_PORT".to_owned(), port);
133 headers.insert("SERVER_PROTOCOL".to_owned(), format!("{:?}", req.version));
134
135 headers.insert(
136 "SERVER_SOFTWARE".to_owned(),
137 SERVER_SOFTWARE_VERSION.to_owned(),
138 );
139
140 // Normalize incoming HTTP headers. The spec says:
141 // "The HTTP header field name is converted to upper case, has all
142 // occurrences of "-" replaced with "_" and has "HTTP_" prepended to
143 // give the meta-variable name."
144 req.headers.iter().for_each(|header| {
145 let key = format!(
146 "HTTP_{}",
147 header.0.as_str().to_uppercase().replace('-', "_")
148 );
149 // Per spec 4.1.18, skip some headers
150 if key == "HTTP_AUTHORIZATION" || key == "HTTP_CONNECTION" {
151 return;
152 }
153 let val = header.1.to_str().unwrap_or("CORRUPT VALUE").to_owned();
154 headers.insert(key, val);
155 });
156
157 headers
158}
159
160/// Internal utility function for parsing a host header.
161///
162/// This attempts to use three sources to construct a definitive host/port pair, ordering
163/// by precedent.
164///
165/// - The content of the host header is considered most authoritative.
166/// - Next most authoritative is self.host, which is set at the CLI or in the config
167/// - As a last resort, we use the host/port that Hyper gives us.
168/// - If none of these provide sufficient data, which is definitely a possiblity,
169/// we go with `localhost` as host and `80` as port. This, of course, is problematic,
170/// but should only manifest if both the server and the client are behaving badly.
171fn parse_host_header_uri(
172 headers: &HeaderMap,
173 uri: &hyper::Uri,
174 default_host: &str,
175) -> (String, String) {
176 let host_header = headers.get(HOST).and_then(|v| match v.to_str() {
177 Err(_) => None,
178 Ok(s) => Some(s.to_owned()),
179 });
180
181 let mut host = uri
182 .host()
183 .map(|h| h.to_string())
184 .unwrap_or_else(|| "localhost".to_owned());
185 let mut port = uri.port_u16().unwrap_or(80).to_string();
186
187 let mut parse_host = |hdr: String| {
188 let mut parts = hdr.splitn(2, ':');
189 match parts.next() {
190 Some(h) if !h.is_empty() => h.clone_into(&mut host),
191 _ => {}
192 }
193 match parts.next() {
194 Some(p) if !p.is_empty() => {
195 tracing::debug!(port = p, "Overriding port");
196 p.clone_into(&mut port);
197 }
198 _ => {}
199 }
200 };
201
202 // Override with local host field if set.
203 if !default_host.is_empty() {
204 parse_host(default_host.to_owned());
205 }
206
207 // Finally, the value of the HOST header is considered authoritative.
208 // When it comes to port number, the HOST header isn't necessarily 100% trustworthy.
209 // But it appears that this is still the best behavior for the CGI spec.
210 if let Some(hdr) = host_header {
211 parse_host(hdr);
212 }
213
214 (host, port)
215}
216
217pub fn compose_response(stdout: &[u8]) -> Result<Response<Body>, Error> {
218 // Okay, once we get here, all the information we need to send back in the response
219 // should be written to the STDOUT buffer. We fetch that, format it, and send
220 // it back. In the process, we might need to alter the status code of the result.
221 //
222 // This is a little janky, but basically we are looping through the output once,
223 // looking for the double-newline that distinguishes the headers from the body.
224 // The headers can then be parsed separately, while the body can be sent back
225 // to the client.
226 let mut last = 0;
227 let mut scan_headers = true;
228 let mut buffer: Vec<u8> = Vec::new();
229 let mut out_headers: Vec<u8> = Vec::new();
230 stdout.iter().for_each(|i| {
231 // Ignore CR in headers
232 if scan_headers && *i == 13 {
233 return;
234 } else if scan_headers && *i == 10 && last == 10 {
235 out_headers.append(&mut buffer);
236 buffer = Vec::new();
237 scan_headers = false;
238 return; // Consume the linefeed
239 }
240 last = *i;
241 buffer.push(*i)
242 });
243 let mut res = Response::new(body::full(buffer.into()));
244 let mut sufficient_response = false;
245 let mut explicit_status_code = false;
246 for h in parse_cgi_headers(String::from_utf8(out_headers)?).iter() {
247 use hyper::header::{CONTENT_TYPE, LOCATION};
248 match h.0.to_lowercase().as_str() {
249 "content-type" => {
250 sufficient_response = true;
251 res.headers_mut().insert(CONTENT_TYPE, h.1.parse()?);
252 }
253 "status" => {
254 // The spec does not say that status is a sufficient response.
255 // (It says that it may be added along with Content-Type, because
256 // a status has a content type). However, CGI libraries in the wild
257 // do not set content type correctly if a status is an error.
258 // See https://datatracker.ietf.org/doc/html/rfc3875#section-6.2
259 sufficient_response = true;
260 explicit_status_code = true;
261 // Status can be `Status CODE [STRING]`, and we just want the CODE.
262 let status_code = h.1.split_once(' ').map(|(code, _)| code).unwrap_or(h.1);
263 tracing::debug!(status_code, "Raw status code");
264 match status_code.parse::<StatusCode>() {
265 Ok(code) => *res.status_mut() = code,
266 Err(e) => {
267 tracing::warn!("Failed to parse code: {}", e);
268 *res.status_mut() = StatusCode::BAD_GATEWAY;
269 }
270 }
271 }
272 "location" => {
273 sufficient_response = true;
274 res.headers_mut()
275 .insert(LOCATION, HeaderValue::from_str(h.1)?);
276 if !explicit_status_code {
277 *res.status_mut() = StatusCode::FOUND;
278 }
279 }
280 _ => {
281 // If the header can be parsed into a valid HTTP header, it is
282 // added to the headers. Otherwise it is ignored.
283 match HeaderName::from_lowercase(h.0.as_str().to_lowercase().as_bytes()) {
284 Ok(hdr) => {
285 res.headers_mut().insert(hdr, HeaderValue::from_str(h.1)?);
286 }
287 Err(e) => {
288 tracing::error!(error = %e, header_name = %h.0, "Invalid header name")
289 }
290 }
291 }
292 }
293 }
294 if !sufficient_response {
295 tracing::debug!("{:?}", res.body());
296 return Ok(internal_error(
297 // Technically, we let `status` be sufficient, but this is more lenient
298 // than the specification.
299 "Exactly one of 'location' or 'content-type' must be specified",
300 ));
301 }
302 Ok(res)
303}
304
305fn parse_cgi_headers(headers: String) -> HashMap<String, String> {
306 let mut map = HashMap::new();
307 headers.trim().split('\n').for_each(|h| {
308 let parts: Vec<&str> = h.splitn(2, ':').collect();
309 if parts.len() != 2 {
310 tracing::warn!(header = h, "corrupt header");
311 return;
312 }
313 map.insert(parts[0].trim().to_owned(), parts[1].trim().to_owned());
314 });
315 map
316}
317
318/// Create an HTTP 500 response
319fn internal_error(msg: impl std::string::ToString) -> Response<Body> {
320 let message = msg.to_string();
321 tracing::error!(error = %message, "HTTP 500 error");
322 let mut res = Response::new(body::full(message.into()));
323 *res.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
324 res
325}