PHP escapeshellarg with unicode/utf-8 support

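By default, escapeshellarg() silently drops any multibyte (for example UTF-8) characters that it does not consider valid in the current locale. In some cases you can solve this by setting the locale to a UTF-8 variant, for example setlocale(LC_CTYPE, 'en_US.UTF-8'), but that does not always work.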

Another way to do this is to write a custom escapeshellarg function:

/**
 * Quote a string so it can be passed as a single shell argument, without
 * discarding multibyte (UTF-8) characters.
 *
 * Unlike the built-in escapeshellarg(), no multibyte validation is done:
 * every byte of $arg is kept, apart from the substitutions described below.
 *
 * @param string $arg The raw argument.
 * @return string The quoted argument, including the surrounding quotes.
 */
function mb_escapeshellarg($arg)
{
	if (strtoupper(substr(PHP_OS, 0, 3)) === 'WIN') {
		// PHP's own implementation (php_escape_shell_arg) writes a space in
		// place of each '"' and '%' on Windows; do the same here rather than
		// deleting them, which would join the neighbouring text together.
		return '"' . str_replace(array('"', '%'), ' ', $arg) . '"';
	}

	// Close the quoted string, emit an escaped quote, and reopen it: ' -> '\''
	return "'" . str_replace("'", "'\\''", $arg) . "'";
}

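The code above is translated from the C source of PHP, leaving out the php_mblen() check that makes the built-in function skip bytes it regards as invalid multibyte sequences.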

Below is the C code behind the built-in escapeshellarg (php_escape_shell_arg, from the file ext/standard/exec.c), if you want to check. Copyright is held by the respective authors according to the license.

/* {{{ php_escape_shell_arg
 *
 * Build a quoted copy of `str` for use as one shell argument.
 *
 * The result is wrapped in double quotes when PHP_WIN32 is defined and in
 * single quotes otherwise. Inside the quotes:
 *   - PHP_WIN32:  each '"' and '%' is replaced by a single space;
 *   - otherwise:  each '\'' is rewritten as the four bytes '\'' (close the
 *                 quoted string, escaped quote, reopen the quoted string).
 * Bytes for which php_mblen() reports a negative length are dropped from the
 * output; sequences it reports as longer than one byte are copied verbatim.
 *
 * Returns a NUL-terminated buffer obtained from safe_emalloc() (possibly
 * shrunk with erealloc()). Presumably the caller is responsible for
 * releasing it -- confirm against the callers.
 *
 * NOTE(review): the length is stored in an `int` although strlen() returns
 * size_t, so an input longer than INT_MAX would be truncated here.
 */
PHPAPI char *php_escape_shell_arg(char *str)
{
	/* x: read index into str, y: write index into cmd, l: input length */
	int x, y = 0, l = strlen(str);
	char *cmd;
	/* Upper bound on the output size: in the non-Win32 branch a quote
	 * expands to 4 bytes, plus the two surrounding quotes and the NUL. */
	size_t estimate = (4 * l) + 3;

	/* NOTE(review): presumably makes the thread-safety context available
	 * to the calls below -- confirm against the TSRM macros. */
	TSRMLS_FETCH();

	/* Presumably allocates 4 * l + 3 bytes, matching `estimate` -- confirm. */
	cmd = safe_emalloc(4, l, 3); /* worst case */

	/* Opening quote */
#ifdef PHP_WIN32
	cmd[y++] = '"';
#else
	cmd[y++] = '\'';
#endif

	for (x = 0; x < l; x++) {
		/* Length of the character starting at str[x], looking at no more
		 * than the remaining bytes. Presumably depends on the current
		 * locale -- confirm against php_mblen. */
		int mb_len = php_mblen(str + x, (l - x));

		/* skip non-valid multibyte characters */
		if (mb_len < 0) {
			/* Only this one byte is dropped; the loop's x++ moves on. */
			continue;
		} else if (mb_len > 1) {
			/* Multibyte character: copy it untouched, so none of its bytes
			 * is ever examined by the switch below. */
			memcpy(cmd + y, str + x, mb_len);
			y += mb_len;
			/* mb_len - 1 here plus the loop's x++ steps over the whole
			 * sequence. */
			x += mb_len - 1;
			continue;
		}

		/* Remaining case: mb_len is 0 or 1, handle str[x] as a single byte. */
		switch (str[x]) {
#ifdef PHP_WIN32
		case '"':
		case '%':
			/* Substitute a space and do NOT copy the original byte. */
			cmd[y++] = ' ';
			break;
#else
		case '\'':
			/* Emit '\ here; the fall-through into `default` then appends
			 * the quote itself, giving '\'' in total. */
			cmd[y++] = '\'';
			cmd[y++] = '\\';
			cmd[y++] = '\'';
#endif
			/* fall-through */
		default:
			cmd[y++] = str[x];
		}
	}
	/* Closing quote */
#ifdef PHP_WIN32
	cmd[y++] = '"';
#else
	cmd[y++] = '\'';
#endif
	cmd[y] = '\0';

	if ((estimate - y) > 4096) {
		/* realloc if the estimate was way overkill
		 * Arbitrary cutoff point of 4096 */
		cmd = erealloc(cmd, y + 1);
	}
	return cmd;
}
/* }}} */
4 January 2012