// MediaWiki:TextCleaner.js

/** Wikitext sanitation for MediaWiki

Author: User:Lupo, January 2008 License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)

Choose whichever license of these you like best :-) // /* global mw:false, TextCleaner:true */ /* eslint no-control-regex:0, one-var:0, vars-on-top:0, camelcase:0, curly:0, space-in-parens:0, computed-property-spacing:0, array-bracket-spacing:0 */ /* jshint curly:false, eqnull:true, laxbreak:true */ (function { 'use strict'; window.TextCleaner = {

// NOTE(review): this copy of the file appears to be damaged and does not parse as it stands:
//  - line breaks are collapsed, so code that followed a `//` comment now sits on the same
//    physical line as that comment;
//  - tokens are missing: the wrapper reads `(function {` without a parameter list, `pop_end`
//    appears as a bare name where a call is evidently meant (compare `push_end(...)`), and the
//    final replace() chain has calls without a second argument;
//  - the line that starts sanitizeWikiText jumps from the middle of its first regular
//    expression straight into the HTML-comment branch, so the definitions of sanitize(),
//    push_end, pop_end, get_initial, consumed, base_regexp and nowiki_regexp are not visible.
// Restore the text from the canonical page before changing any logic here.
//
// imgNamespaceNames starts out as null. NOTE(review): no visible code reads or writes it; the
// comment on `initial` ("'image:' or 'file:'") and the call get_initial(i, s) suggest it caches
// the image/file namespace prefixes, presumably filled on first use -- confirm against the
// canonical source.
imgNamespaceNames: null,

// sanitizeWikiText(input, only_thumbs)
//   input       - text to clean; it is used through input.search() and input.replace().
//   only_thumbs - copied semantics from the original comment below: when true, image links are
//                 turned into ":Image:"-style links unless they are thumbnails or narrow.
//   returns     - the cleaned text (the result of the replace() chain at the very end).
// What the visible code does with the text:
//   - unclosed templates, tables and galleries are tracked on the `endings` stack and closed
//     when the input runs out ("Close open templates and tables");
//   - a template or table end that matches nothing on the stack is emitted as HTML entities
//     ('&#x7D;&#x7D;' or '&#x7C;&#x7D;'), an extra closing bracket as '&#x5D;' or
//     '&#x5D;&#x5D;', and a top-level pipe as '&#x7C;';
//   - links that are never closed get a closing ']' or the "]]" placeholder appended.
// NOTE(review): the early-return test at the start of the function is truncated in this copy.
// This function attempts to construct well-formed wikitext from input that may contain // possibly broken wikitext. // // Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence // of templates, and due to the fact that image thumbnail captions may themselves contain // links. This implementation catches the most common errors (such as forgetting to close a // template or a link), and even some more elaborate ones. With enough malice, this sanitation // can still be broken by user input such that the result is not well-formed wikitext as the // parser at the servers would like to have it. (It's still possible that the result is broken // wikitext, if the input was broken wikitext. But it never transforms well-formed wikitext // into broken wikitext.) // // If 'only_thumbs' is true, all Image: links are changed to :Image:, unless the original // image link was a thumbnail or had a width smaller than 300px specified. // // WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It is // probably rather inefficient due to the many substrings that are generated. This function is // primarily intended to be used to clean up user input in forms, which are typically rather // short. sanitizeWikiText: function (input, only_thumbs) { if (input.search(/[\][}{]|]*)?>|', next + 3);							if (i < 0) {								result += s + '-->';								s = '';							} else {								result += s.substring(0, i + 3);								s = s.substring(i + 3);							}						} else if (s.charAt(next + 1) === 'n') {					// Nowiki may contain HTML comments!							in_nowiki = true;							regexp = nowiki_regexp;							result += s.substring(0, next + 7);							s = s.substring(next + 7);						} else {					// End of nowiki. 
Searched for and found only if in_nowiki === true							in_nowiki = false;							regexp = base_regexp;							i = s.indexOf('>', next + 1); // End of tag							result += s.substring(0, i + 1);							s = s.substring(i + 1);						}						break;					case '\x05':				// Table start						if (!with_tables) {							result += s.substring(0, next);							get_out = true;							break;						}				/* fall through */					case '\x07':						if (ch === '\x07' && !with_galleries) { result += s.substring(0, next); get_out = true; break; }				/* fall through */ case '\x01': // Start of template, table, or gallery result += s.substring(0, next + 1); push_end(String.fromCharCode(ch.charCodeAt(0) + 1).charAt(0)); s = s.substring(next + 1); break; case '\x06': // Table end if (break_at_pipe && !endings) { result += s.substring(0, next); get_out = true; break; }				/* fall through */ case '\x02': // End of a template or table result += s.substring(0, next); if (!endings || endings[endings.length - 1] !== ch) { // Spurious template or table end if (ch === '\x02') result += '&#x7D;&#x7D;'; else result += '&#x7C;&#x7D;'; } else { result += pop_end; }						s = s.substring(next + 1); break; case '\x08': // End of gallery result += s.substring(0, next + 1); if (endings && endings[endings.length - 1] === ch) pop_end; s = s.substring(next + 1); break; case '\x03': case '[': { if (!with_links && !endings) { get_out = true; break; }					// Image links must be treated specially, since they may contain nested links // in the caption! var initial = null; // If set, it's 'image:' or 'file:' and we have an image link i = next; while (i < s.length && s.charAt(i) === ch) i++; if (ch === '\x03' && i < s.length && s.charAt(i) === '[') i++; initial = get_initial(i, s);

// Opening placeholders are pushed onto `endings` as "own character code + 1", which is the code
// of the matching closing placeholder (\x01 -> \x02, \x05 -> \x06, \x07 -> \x08).
// Going by the call sites, sanitize() takes its arguments in this order: text, with_links,
// caption_level, allow_thumbs, break_at_pipe, with_tables, with_galleries.
// A nested call made with caption_level + 1 reports how many characters it used by storing the
// count in consumed[caption_level]; the caller reads it back right after the call.
// Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |} var lk_text = sanitize(s.substring(i),							false, // No links at top-level allowed							caption_level + 1,							false, // No thumbs							true, // Break at pipe							false, // No tables							false); // No galleries var lk_text_length = consumed[caption_level]; j = i + lk_text_length; if (j >= s.length) { // Used up the whole text: [[Foo or [bar							if (initial && allow_only_thumbs) {							// Should in any case have started with [[, not [

result += s.substring(0, i - 1) + '\x03:' + initial + lk_text.substring(initial.length) + '\x04'; } else { result += s.substring(0, i) + lk_text + ((s.charAt(i - 1) === '[') ? ']' : '\x04'); }							s = ''; break; }						if (s.charAt(j) === '|') k = j;						else k = -1; if (k < 0) { // No pipe found: we should be on the closing ]] or ] or Foo or [bar] if (initial && allow_only_thumbs) { // Should in any case have started with | s.charAt(j) === '\x04') {							// Indeed closing the link								s = s.substring(j + 1);							} else {								s = s.substring(j);							}							break;						} else {							var caption = null;							var used = 0;						// Pipe found.							if (!initial) {							// Not an image link. Must be something like [[Foo|Bar.								caption = sanitize(									s.substring(k + 1),									// No links, please									false,									caption_level + 1,									// No thumbs either									false,									// Don't care about pipes									false,									// Allow tables (yes, parser allows that!)									true,									// Allow galleries (?)									true);							// Now we're at, [, , or ]								used = consumed[caption_level];								result += s.substring(0, i) + lk_text + '|' + caption +							((s.charAt(i - 1) === '[') ? ']' : '\x04');							} else {								var q = s.substring(k);							// We assume that there are no templates, nowikis, and other nasty things							// in the parameters. 
Search forward until the next [, {, ], }								var l = q.search(/[\x01\x02\x03[\x04\]{}\x05\x06\x07\x08]/);								if (l < 0)									l = q.length;								if (l + 1 < q.length)									q = q.substring(0, l + 1);								var is_thumb = q.search(/\|\s*thumb(nail)?\s*[|\x04]/) >= 0;								var img_width = /\|\s*(\d+)px\s*[|\x04]/.exec(q);								if (img_width && img_width.length > 1) {									img_width = parseInt(img_width[1], 10);									if (isNaN(img_width))										img_width = null;								} else {									img_width = null;								}								if (!img_width)									img_width = is_thumb ? 180 : 301;								var is_small = img_width <= 300;

// Width handling above: when no usable "NNNpx" parameter is found, the width falls back to 180
// for thumbs and 301 otherwise, so is_small (width <= 300) holds for thumbs without a width and
// for images with an explicit width of at most 300.
// Below, is_thumb is reassigned to (allow_thumbs && is_small) after the caption has been
// sanitized; from then on it decides whether the link stays an image link or gets the ':' prefix.
// Caption starts at the last pipe before l. If that is a parameter, // it doesn't hurt. var m = k + q.lastIndexOf('|', l); caption = sanitize(									s.substring(m + 1),									// Allow links only if it's a thumb									is_thumb,									caption_level + 1,									allow_thumbs && is_thumb,									// Don't break at pipe									false,									// Tables only if it's a thumb									is_thumb,									// Allow galleries for thumbs (?)									is_thumb); used = consumed[caption_level]; // caption used 'used' chars from m+1, s.charAt(m+1+used) === '\x04' is_thumb = allow_thumbs && is_small; if (is_thumb || !allow_only_thumbs) { result += s.substring(0, i - 1) + '\x03' + lk_text; } else { result += s.substring(0, i - 1) + '\x03:' + initial + lk_text.substring(initial.length); }								result += s.substring(k, m + 1) + caption + '\x04'; k = m;							} next = k + 1 + used; if (next < s.length) { if (s.charAt(next) !== '\x04') s = s.substring(next); else s = s.substring(next + 1); } else { s = ''; }						}						break; }					case '\x04': case ']': // Extra bracket. result += s.substring(0, next); if (!caption_level && !break_at_pipe) { result += (ch === ']' ? '&#x5D;' : '&#x5D;&#x5D;'); s = s.substring(next + 1); } else { get_out = true; }						break; case '|': result += s.substring(0, next); if (break_at_pipe && !endings) { // Pipe character at top level get_out = true; } else { if (!caption_level && !break_at_pipe && !endings) result += '&#x7C;'; // Top-level pipe character else result += '|'; s = s.substring(next + 1); }						break; } // end switch } // end while if (in_nowiki) result += ' '; // Make sure this nowiki is closed. // Close open templates and tables while (endings) { ch = pop_end; result += (ch === '\x06' ? '\n' : '') + ch; }			if (caption_level > 0) { var used_up = initial_length - (get_out ? (s.length - next) : 0); if (consumed.length < caption_level) consumed.push(used_up); else consumed[caption_level - 1] = used_up; }			return result; }

// NOTE(review): in the line above, `if (in_nowiki) result += ' ';` is commented as closing the
// open nowiki but appends only a space -- presumably the closing tag text was lost; confirm.
// NOTE(review): `while (endings) { ch = pop_end; ... }` never calls pop_end as written, so
// `endings` would not shrink; presumably `pop_end()` was intended.
//
// Placeholder mapping set up by the replace() chain below:
//   "{{" -> \x01   "}}" -> \x02   "[[" -> \x03   "]]" -> \x04
//   table start "{|" -> \x05   table end "|}" -> \x06
//   opening gallery tag -> \x07   closing gallery tag -> \x08
// The combined "|}}}" at the start of a line is handled first and becomes \x06\x02.
// Replace multi-character tokens by one-character placeholders, simplifying the // subsequent processing. var s = input.replace(/\{\{/g, '\x01') .replace(/\n\s*\|\}\}\}/g, '\n\x06\x02') // Table end + template end .replace(/\}\}/g, '\x02') .replace(/\[\[/g, '\x03') .replace(/\]\]/g, '\x04') .replace(/\n\s*\{\|/g, '\n\x05') // Table start and end must be on own line .replace(/^\s*\{\|/, '\x05') // Table start at the very beginning .replace(/\n\s*\|\}/g, '\n\x06') // (we strip leading whitespace) .replace(/<\s*gallery\s*>/g, '\x07') .replace(/<\/\s*gallery\s*>/g, '\x08');

// The top-level sanitize() call runs at caption level 0; the chain after it is meant to turn
// the placeholders back into wiki markup.
// NOTE(review): that restoring chain is visibly damaged: the replacement strings for \x01 and
// \x07 are empty, those for \x03 and \x04 are missing altogether, and \x02 and \x08 are not
// mapped back at all. Going by the forward mapping, the replacements should presumably be
// '{{', '}}', '[[', ']]', the opening gallery tag and the closing gallery tag -- confirm
// against the canonical source before relying on this function's output.
s = sanitize(s, true, 0, true, false, true, true); // with links, allow thumbs, don't break at pipe, allow tables, allow galleries return s.replace(/\x01/g, '') .replace(/\x03/g, )			.replace(/\x04/g, ) .replace(/\x05/g, '{|') .replace(/\x06/g, '|}') .replace(/\x07/g, ''); } }; }); //