MediaWiki:TextCleaner.js

Note: After publishing, you may have to bypass your browser's cache to see the changes.
Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (⌘-R on a Mac)
Google Chrome: Press Ctrl-Shift-R (⌘-Shift-R on a Mac)
Internet Explorer / Edge: Hold Ctrl while clicking Refresh, or press Ctrl-F5
Opera: Press Ctrl-F5.
/**
Wikitext sanitation for MediaWiki

Author: [[User:Lupo]], January 2008
License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)

Choose whichever license of these you like best :-)
*/
// <nowiki>
/* global mw:false, TextCleaner:true */
/* eslint no-control-regex:0, one-var:0, vars-on-top:0, camelcase:0, curly:0, space-in-parens:0, computed-property-spacing:0, array-bracket-spacing:0 */
/* jshint curly:false, eqnull:true, laxbreak:true */
( function () {
	'use strict';

	/**
	 * Wikitext clean-up helper, published as the global `TextCleaner`.
	 */
	window.TextCleaner = {
		// Names (including aliases) of the file namespace (id 6). Built lazily on the first call
		// of sanitizeWikiText() that actually needs to parse something.
		imgNamespaceNames: null,

		/**
		 * Attempts to construct well-formed wikitext from input that may contain broken wikitext.
		 *
		 * Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence
		 * of templates, and due to the fact that image thumbnail captions may themselves contain
		 * links. This implementation catches the most common errors (such as forgetting to close a
		 * template or a link), and even some more elaborate ones. With enough malice, this
		 * sanitation can still be broken by user input such that the result is not well-formed
		 * wikitext as the parser at the servers would like to have it.
		 *
		 * What is done:
		 * - unclosed templates, tables, links, nowiki sections and HTML comments are closed;
		 * - spurious closing tokens at top level are replaced by character references
		 *   (`]` -> `&#x5D;`, `}}` -> `&#x7D;&#x7D;`, `|}` -> `&#x7C;&#x7D;`), and a top-level `|`
		 *   outside of any template/table/gallery becomes `&#x7C;`;
		 * - if `only_thumbs` is true, inline images are turned into plain links (`[[:File:...`)
		 *   unless their effective width is at most 300px. The effective width is the explicit
		 *   `NNNpx` parameter if present, otherwise 180 for `thumb`/`thumbnail` images and 301 for
		 *   everything else. Image links without any parameter are always turned into plain links.
		 *
		 * WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It
		 * is probably rather inefficient due to the many substrings that are generated. This
		 * function is primarily intended to be used to clean up user input in forms, which are
		 * typically rather short.
		 *
		 * @param {string} input Wikitext to clean up.
		 * @param {boolean} only_thumbs Whether to de-inline images that are not small (see above).
		 * @return {string} The sanitized wikitext; `input` itself if it contains no critical
		 *  characters.
		 */
		sanitizeWikiText: function ( input, only_thumbs ) {
			// Case-insensitive like the regexps used for the actual parsing below, so that
			// e.g. an unclosed <NOWIKI> is not skipped by this shortcut.
			if ( input.search( /[\][}{]|<nowiki(\s[^>]*)?>|<!--/i ) < 0 ) {
				// No critical characters
				return input;
			}

			if ( !TextCleaner.imgNamespaceNames ) {
				TextCleaner.imgNamespaceNames = [];

				var namespaceIds = mw.config.get( 'wgNamespaceIds' );
				if ( namespaceIds ) {
					for ( var name in namespaceIds ) {
						// Own properties only: do not pick up anything inherited via the prototype.
						if ( Object.prototype.hasOwnProperty.call( namespaceIds, name ) &&
							namespaceIds[name] === 6 // Image namespace
						) {
							TextCleaner.imgNamespaceNames.push( name );
						}
					}
				}

				// Make sure that we have the two canonical names
				TextCleaner.imgNamespaceNames.push( 'Image' );
				TextCleaner.imgNamespaceNames.push( 'File' );
				// If your wiki does not have wgNamespaceIds, add aliases or localized namespace names here!
			}

			// consumed[n] is written by a sanitize() call running at caption_level n + 1 and read
			// by its caller (running at caption_level n): it is the number of characters of the
			// passed string that the nested call has processed.
			var consumed = [ 0, 0 ];

			// For image captions. Image caption may contain links, and may even contain images.
			// The current MediaWiki parser actually allows this only once. For deeper recursions,
			// it fails. But here, it's actually easier to implement no limit.
			//
			// Placeholders (see the replacements at the end of this function):
			//   \x01 = {{   \x02 = }}   \x03 = [[   \x04 = ]]
			//   \x05 = {|   \x06 = |}   \x07 = <gallery>   \x08 = </gallery>
			var base_regexp = new RegExp(
				'[\\x01\\x02\\x03\\x04[\\]\\|\\x05\\x06\\x07\\x08]' +
					'|<nowiki(\\s[^>]*)?>|<!--',
				'i' // Ignore case
			);
			// Inside a nowiki section only its end tag and HTML comments are of interest.
			var nowiki_regexp = new RegExp( '<\\/nowiki(\\s[^>]*)?>|<!--', 'i' );
			var allow_only_thumbs = only_thumbs;

			/**
			 * Recursive worker operating on text in which the multi-character tokens have already
			 * been replaced by the one-character placeholders.
			 *
			 * Side effect: if caption_level > 0, stores the number of characters of `s` that were
			 * processed in consumed[caption_level - 1].
			 *
			 * @param {string} s Text to process.
			 * @param {boolean} with_links Whether `[` / `[[` may start a link here. If not, the
			 *  scan stops in front of the bracket (unless inside a template/table/gallery).
			 * @param {number} caption_level Nesting depth; 0 for the top-level call.
			 * @param {boolean} allow_thumbs Whether images may remain inline at this level.
			 * @param {boolean} break_at_pipe Whether to stop at a `|` or `|}` that is not inside a
			 *  template/table/gallery opened within `s`.
			 * @param {boolean} with_tables Whether a table start is allowed; if not, the scan
			 *  stops in front of it.
			 * @param {boolean} with_galleries Whether a gallery start is allowed; if not, the scan
			 *  stops in front of it.
			 * @return {string} The sanitized version of the processed prefix of `s`.
			 */
			function sanitize( s, with_links, caption_level, allow_thumbs, break_at_pipe, with_tables, with_galleries ) {
				if ( !s || !s.length ) {
					if ( caption_level > 0 ) {
						if ( consumed.length < caption_level ) {
							consumed.push( 0 );
						} else {
							consumed[caption_level - 1] = 0;
						}
					}

					return s;
				}

				var result = '';
				var ch = '';
				var initial_length = s.length;
				// Set when the scan has to stop in front of s.charAt( next )
				var get_out = false;
				var in_nowiki = false;
				// Stack recording template, table and gallery nesting: holds the closing
				// placeholders still expected; null when empty.
				var endings = null;

				var next;
				var regexp = base_regexp;

				// Pushes a closing placeholder onto the nesting stack.
				function push_end( val ) {
					if ( !endings ) {
						endings = [ val ];
					} else {
						endings.push( val );
					}
				}

				// Pops the innermost closing placeholder; resets the stack to null when it
				// becomes empty, because the code tests `endings` for truthiness.
				function pop_end() {
					if ( !endings ) {
						// Shouldn't happen
						return null;
					}

					var result;
					if ( endings.length === 1 ) {
						result = endings[0];
						endings = null;
					} else {
						result = endings[endings.length - 1];
						endings.length--;
					}

					return result;
				}

				// Returns the namespace prefix including the colon, as written in `s` at index
				// `i`, if `s` continues there with a file namespace name (compared
				// case-insensitively); null otherwise.
				function get_initial( i, s ) {
					for ( var j = 0; j < TextCleaner.imgNamespaceNames.length; j++ ) {
						if ( s.length >= i + TextCleaner.imgNamespaceNames[j].length + 1 ) {
							var t = s.substr( i, TextCleaner.imgNamespaceNames[j].length + 1 );
							if ( t.toLowerCase() === ( TextCleaner.imgNamespaceNames[j].toLowerCase() + ':' ) ) {
								return t;
							}
						}
					}

					return null;
				}

				while ( s.length > 0 && !get_out ) {
					next = s.search( regexp );
					if ( next < 0 ) {
						result += s;
						break;
					}

					ch = s.charAt( next );
					var i = -1;
					var j = -1;
					var k = -1;

					switch ( ch ) {
						case '<':
							// Nowiki or HTML comment. Must be closed.
							if ( s.charAt( next + 1 ) === '!' ) {
								// HTML comment. Cannot be nested.
								i = s.indexOf( '-->', next + 3 );
								if ( i < 0 ) {
									result += s + '-->';
									s = '';
								} else {
									result += s.substring( 0, i + 3 );
									s = s.substring( i + 3 );
								}
							} else if ( s.charAt( next + 1 ).toLowerCase() === 'n' ) {
								// Start of nowiki. The regexps ignore case, hence so must this
								// test; otherwise <NOWIKI> would be mistaken for an end tag.
								// Nowiki may contain HTML comments!
								in_nowiki = true;
								regexp = nowiki_regexp;
								// Copy '<nowiki' (7 characters); the rest of the tag is passed
								// through as ordinary nowiki content.
								result += s.substring( 0, next + 7 );
								s = s.substring( next + 7 );
							} else {
								// End of nowiki. Searched for and found only if in_nowiki === true
								in_nowiki = false;
								regexp = base_regexp;
								i = s.indexOf( '>', next + 1 ); // End of tag
								result += s.substring( 0, i + 1 );
								s = s.substring( i + 1 );
							}
							break;
						case '\x05':
							// Table start
							if ( !with_tables ) {
								result += s.substring( 0, next );
								get_out = true;
								break;
							}
						/* fall through */
						case '\x07':
							// Gallery start (also reached by fall-through from a table start)
							if ( ch === '\x07' && !with_galleries ) {
								result += s.substring( 0, next );
								get_out = true;
								break;
							}
						/* fall through */
						case '\x01':
							// Start of template, table, or gallery. The matching end placeholder
							// always is the start placeholder's character code plus one.
							result += s.substring( 0, next + 1 );
							push_end( String.fromCharCode( ch.charCodeAt( 0 ) + 1 ).charAt( 0 ) );
							s = s.substring( next + 1 );
							break;
						case '\x06':
							// Table end
							if ( break_at_pipe && !endings ) {
								result += s.substring( 0, next );
								get_out = true;
								break;
							}
						/* fall through */
						case '\x02':
							// End of a template or table
							result += s.substring( 0, next );
							if ( !endings || endings[endings.length - 1] !== ch ) {
								// Spurious template or table end
								if ( ch === '\x02' ) {
									result += '&#x7D;&#x7D;';
								} else {
									result += '&#x7C;&#x7D;';
								}
							} else {
								result += pop_end();
							}
							s = s.substring( next + 1 );
							break;
						case '\x08':
							// End of gallery. Always copied; pops the stack only if it matches.
							result += s.substring( 0, next + 1 );
							if ( endings && endings[endings.length - 1] === ch ) {
								pop_end();
							}
							s = s.substring( next + 1 );
							break;
						case '\x03':
						case '[': {
							if ( !with_links && !endings ) {
								// Stop in front of the bracket, but keep the text preceding it,
								// as all the other early exits do. Otherwise that text would be
								// lost, since the caller resumes at the bracket itself.
								result += s.substring( 0, next );
								get_out = true;
								break;
							}

							// Image links must be treated specially, since they may contain nested links
							// in the caption!
							var initial = null; // If set, it's 'image:' or 'file:' and we have an image link
							i = next;

							// Skip all consecutive occurrences of the opening character
							while ( i < s.length && s.charAt( i ) === ch ) {
								i++;
							}

							// '[[[' has become '\x03[': skip that extra bracket, too
							if ( ch === '\x03' && i < s.length && s.charAt( i ) === '[' ) {
								i++;
							}

							initial = get_initial( i, s );

							// Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |}
							var lk_text = sanitize(
								s.substring( i ),
								false, // No links at top-level allowed
								caption_level + 1,
								false, // No thumbs
								true, // Break at pipe
								false, // No tables
								false // No galleries
							);

							var lk_text_length = consumed[caption_level];
							j = i + lk_text_length; // Index in s where the scan of the target stopped

							if ( j >= s.length ) {
								// Used up the whole text: [[Foo or [bar
								if ( initial && allow_only_thumbs ) {
									// Should in any case have started with [[, not [
									result += s.substring( 0, i - 1 ) + '\x03:' + initial +
										lk_text.substring( initial.length ) + '\x04';
								} else {
									result += s.substring( 0, i ) + lk_text +
										( ( s.charAt( i - 1 ) === '[' ) ? ']' : '\x04');
								}

								s = '';
								break;
							}

							if ( s.charAt( j ) === '|' ) {
								k = j;
							} else {
								k = -1;
							}

							if ( k < 0 ) {
								// No pipe found: we should be on the closing ]] or ] or [[Foo]] or [bar]
								if ( initial && allow_only_thumbs ) {
									// Should in any case have started with [[, not [
									result += s.substring( 0, i - 1 ) + '\x03:' + initial +
										lk_text.substring( initial.length ) + '\x04';
								} else {
									result += s.substring( 0, i ) + lk_text +
										( ( s.charAt( i - 1 ) === '[') ? ']' : '\x04' );
								}

								if ( s.charAt( j ) === ']' || s.charAt( j ) === '\x04' ) {
									// Indeed closing the link
									s = s.substring( j + 1 );
								} else {
									s = s.substring( j );
								}

								break;
							} else {
								var caption = null;
								var used = 0;

								// Pipe found.
								if ( !initial ) {
									// Not an image link. Must be something like [[Foo|Bar]].
									caption = sanitize(
										s.substring( k + 1 ),
										// No links, please
										false,
										caption_level + 1,
										// No thumbs either
										false,
										// Don't care about pipes
										false,
										// Allow tables (yes, parser allows that!)
										true,
										// Allow galleries (?)
										true
									);

									// Now we're at [[, [, ]], or ]
									used = consumed[caption_level];

									result += s.substring( 0, i ) + lk_text + '|' + caption +
										( ( s.charAt( i - 1 ) === '[') ? ']' : '\x04' );
								} else {
									var q = s.substring( k );

									// We assume that there are no templates, nowikis, and other nasty things
									// in the parameters. Search forward until the next [, {, ], }
									var l = q.search( /[\x01\x02\x03[\x04\]{}\x05\x06\x07\x08]/ );
									if ( l < 0 ) {
										l = q.length;
									}
									if ( l + 1 < q.length ) {
										q = q.substring( 0, l + 1 );
									}

									var is_thumb = q.search( /\|\s*thumb(nail)?\s*[|\x04]/ ) >= 0;
									var img_width = /\|\s*(\d+)px\s*[|\x04]/.exec( q );
									if ( img_width && img_width.length > 1 ) {
										img_width = parseInt( img_width[1], 10 );
										if ( isNaN( img_width ) ) {
											img_width = null;
										}
									} else {
										img_width = null;
									}
									if ( !img_width ) {
										// No usable explicit width: assume 180px for thumbs and
										// something just above the limit for everything else.
										img_width = is_thumb ? 180 : 301;
									}
									var is_small = img_width <= 300;

									// Caption starts at the last pipe before l. If that is a parameter,
									// it doesn't hurt.
									var m = k + q.lastIndexOf( '|', l );
									caption = sanitize(
										s.substring( m + 1 ),
										// Allow links only if it's a thumb
										is_thumb,
										caption_level + 1,
										allow_thumbs && is_thumb,
										// Don't break at pipe
										false,
										// Tables only if it's a thumb
										is_thumb,
										// Allow galleries for thumbs (?)
										is_thumb
									);
									used = consumed[caption_level];

									// caption used 'used' chars from m+1, s.charAt(m+1+used) === '\x04'
									// From here on, is_thumb means "may remain an inline image".
									is_thumb = allow_thumbs && is_small;
									if ( is_thumb || !allow_only_thumbs ) {
										result += s.substring( 0, i - 1 ) + '\x03' + lk_text;
									} else {
										result += s.substring( 0, i - 1 ) + '\x03:' + initial +
											lk_text.substring( initial.length);
									}

									result += s.substring( k, m + 1 ) + caption + '\x04';
									k = m;
								}

								// Continue behind the caption, skipping the link's closing ]] if present
								next = k + 1 + used;
								if ( next < s.length ) {
									if ( s.charAt( next ) !== '\x04' ) {
										s = s.substring( next );
									} else {
										s = s.substring( next + 1 );
									}
								} else {
									s = '';
								}
							}

							break;
						}
						case '\x04':
						case ']':
							// Extra bracket.
							result += s.substring( 0, next );
							if ( !caption_level && !break_at_pipe ) {
								result += ( ch === ']' ? '&#x5D;' : '&#x5D;&#x5D;' );
								s = s.substring( next + 1 );
							} else {
								// Inside a link: this ends the link part; let the caller handle it.
								get_out = true;
							}
							break;
						case '|':
							result += s.substring( 0, next );
							if ( break_at_pipe && !endings ) {
								// Pipe character at top level
								get_out = true;
							} else {
								if ( !caption_level && !break_at_pipe && !endings ) {
									result += '&#x7C;'; // Top-level pipe character
								} else {
									result += '|';
								}

								s = s.substring( next + 1 );
							}
							break;
					} // end switch
				} // end while

				if ( in_nowiki ) {
					result += '</nowiki>'; // Make sure this nowiki is closed.
				}

				// Close open templates and tables
				while ( endings ) {
					ch = pop_end();
					// A table end must be on a line of its own
					result += ( ch === '\x06' ? '\n' : '' ) + ch;
				}

				if ( caption_level > 0 ) {
					// On an early exit, s still starts at the last search position and the
					// character at index `next` and everything behind it is unprocessed.
					var used_up = initial_length - ( get_out ? ( s.length - next ) : 0 );
					if ( consumed.length < caption_level ) {
						consumed.push( used_up );
					} else {
						consumed[caption_level - 1] = used_up;
					}
				}

				return result;
			}

			// Replace multi-character tokens by one-character placeholders, simplifying the
			// subsequent processing.
			var s = input.replace( /\{\{/g, '\x01' )
				.replace( /\n\s*\|\}\}\}/g, '\n\x06\x02' ) // Table end + template end
				.replace( /\}\}/g, '\x02' )
				.replace( /\[\[/g, '\x03' )
				.replace( /\]\]/g, '\x04' )
				.replace( /\n\s*\{\|/g, '\n\x05' ) // Table start and end must be on own line
				.replace( /^\s*\{\|/, '\x05' ) // Table start at the very beginning
				.replace( /\n\s*\|\}/g, '\n\x06' ) // (we strip leading whitespace)
				.replace( /<\s*gallery\s*>/g, '\x07' )
				.replace( /<\/\s*gallery\s*>/g, '\x08' );

			// Top level: with links, allow thumbs, don't break at pipe, allow tables, allow galleries
			s = sanitize( s, true, 0, true, false, true, true );

			// Translate the placeholders back into the real tokens.
			return s.replace( /\x01/g, '{{' )
				.replace( /\x02/g, '}}' )
				.replace( /\x03/g, '[[' )
				.replace( /\x04/g, ']]' )
				.replace( /\x05/g, '{|' )
				.replace( /\x06/g, '|}' )
				.replace( /\x07/g, '<gallery>' )
				.replace( /\x08/g, '</gallery>' );
		}
	};
}() );
// </nowiki>
MediaWiki:TextCleaner.js

Navigation menu

Search