MediaWiki:TextCleaner.js
Jump to navigation
Jump to search
Note: After publishing, you may have to bypass your browser's cache to see the changes.
- Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (⌘-R on a Mac)
- Google Chrome: Press Ctrl-Shift-R (⌘-Shift-R on a Mac)
- Internet Explorer / Edge: Hold Ctrl while clicking Refresh, or press Ctrl-F5
- Opera: Press Ctrl-F5.
/**
Wikitext sanitation for MediaWiki
Author: [[User:Lupo]], January 2008
License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)
Choose whichever license of these you like best :-)
*/
// <nowiki>
/* global mw:false, TextCleaner:true */
/* eslint no-control-regex:0, one-var:0, vars-on-top:0, camelcase:0, curly:0, space-in-parens:0, computed-property-spacing:0, array-bracket-spacing:0 */
/* jshint curly:false, eqnull:true, laxbreak:true */
( function () {
'use strict';
window.TextCleaner = {
imgNamespaceNames: null,
// This function attempts to construct well-formed wikitext from input that may contain
// possibly broken wikitext.
//
// Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence
// of templates, and due to the fact that image thumbnail captions may themselves contain
// links. This implementation catches the most common errors (such as forgetting to close a
// template or a link), and even some more elaborate ones. With enough malice, this sanitation
// can still be broken by user input such that the result is not well-formed wikitext as the
// parser at the servers would like to have it. (It's still possible that the result is broken
// wikitext, if the input was broken wikitext. But it never transforms well-formed wikitext
// into broken wikitext.)
//
// If 'only_thumbs' is true, all [[Image: links are changed to [[:Image:, unless the original
// image link specified a width of at most 300px, or was a thumbnail without an explicit width.
//
// WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It is
// probably rather inefficient due to the many substrings that are generated. This function is
// primarily intended to be used to clean up user input in forms, which are typically rather
// short.
sanitizeWikiText: function ( input, only_thumbs ) {
if ( input.search( /[\][}{]|<nowiki(\s[^>]*)?>|<!--/ ) < 0 ) {
// No critical characters
return input;
}
if ( !TextCleaner.imgNamespaceNames ) {
TextCleaner.imgNamespaceNames = [];
var namespaceIds = mw.config.get( 'wgNamespaceIds' );
if ( namespaceIds ) {
for ( var name in namespaceIds ) {
if ( namespaceIds[name] === 6 ) { // Image namespace
TextCleaner.imgNamespaceNames.push( name );
}
}
}
// Make sure that we have the two canonical names
TextCleaner.imgNamespaceNames.push( 'Image' );
TextCleaner.imgNamespaceNames.push( 'File' );
// If your wiki does not have wgNamespaceIds, add aliases or localized namespace names here!
}
var consumed = [ 0, 0 ];
// For image captions. Image caption may contain links, and may even contain images.
// The current MediaWiki parser actually allows this only once. For deeper recursions,
// it fails. But here, it's actually easier to implement no limit.
var base_regexp = new RegExp(
'[\\x01\\x02\\x03\\x04[\\]\\|\\x05\\x06\\x07\\x08]' +
'|<nowiki(\\s[^>]*)?>|<!--',
'i' // Ignore case
);
var nowiki_regexp = new RegExp( '<\\/nowiki(\\s[^>]*)?>|<!--', 'i' );
var allow_only_thumbs = only_thumbs;
/**
 * Recursive worker that repairs one stretch of tokenized wikitext.
 *
 * The text uses the one-character placeholders produced by the caller:
 * \x01 {{   \x02 }}   \x03 [[   \x04 ]]   \x05 {|   \x06 |}   \x07 <gallery>   \x08 </gallery>
 *
 * Reads the enclosing variables base_regexp, nowiki_regexp, allow_only_thumbs and
 * TextCleaner.imgNamespaceNames. When caption_level > 0 it stores the number of
 * characters of 's' it handled in consumed[caption_level - 1], so that the caller
 * knows where to continue.
 *
 * @param {string} s Tokenized text to process.
 * @param {boolean} with_links If false, a link start outside of any open template,
 *  table or gallery ends the processing of this stretch.
 * @param {number} caption_level Nesting depth; 0 for the outermost call.
 * @param {boolean} allow_thumbs Whether small images may stay inline at this level.
 * @param {boolean} break_at_pipe If true, a pipe or table end outside of any open
 *  template, table or gallery ends the processing of this stretch.
 * @param {boolean} with_tables If false, a table start ends the processing.
 * @param {boolean} with_galleries If false, a gallery start ends the processing.
 * @return {string} The repaired text for the handled part of 's'.
 */
function sanitize( s, with_links, caption_level, allow_thumbs, break_at_pipe, with_tables, with_galleries ) {
	// Tell the caller (if any) how many characters of the input were handled.
	function report_consumed( count ) {
		if ( caption_level > 0 ) {
			if ( consumed.length < caption_level ) {
				consumed.push( count );
			} else {
				consumed[caption_level - 1] = count;
			}
		}
	}

	if ( !s || !s.length ) {
		report_consumed( 0 );
		return s;
	}

	var result = '';
	var ch = '';
	var initial_length = s.length;
	var get_out = false;
	var in_nowiki = false;
	// Stack recording template, table and gallery nesting; null while empty, because
	// the code below tests "!endings" to find out whether it is at the top level.
	var endings = null;
	var next;
	var regexp = base_regexp;

	// Remember the closing token expected for a construct that was just opened.
	function push_end( val ) {
		if ( !endings ) {
			endings = [];
		}
		endings.push( val );
	}

	// Remove and return the innermost expected closing token (null if there is none).
	function pop_end() {
		if ( !endings ) {
			// Shouldn't happen
			return null;
		}
		var top = endings.pop();
		if ( endings.length === 0 ) {
			endings = null;
		}
		return top;
	}

	// If 'text' has an image namespace prefix ("Image:", "File:", ...) at 'pos', return
	// that prefix exactly as written in the text (including the colon); otherwise null.
	function get_initial( pos, text ) {
		for ( var n = 0; n < TextCleaner.imgNamespaceNames.length; n++ ) {
			var prefix_length = TextCleaner.imgNamespaceNames[n].length + 1;
			if ( text.length >= pos + prefix_length ) {
				var t = text.substring( pos, pos + prefix_length );
				if ( t.toLowerCase() === ( TextCleaner.imgNamespaceNames[n].toLowerCase() + ':' ) ) {
					return t;
				}
			}
		}
		return null;
	}

	while ( s.length > 0 && !get_out ) {
		next = s.search( regexp );
		if ( next < 0 ) {
			// Nothing critical left.
			result += s;
			break;
		}
		ch = s.charAt( next );
		var i = -1;
		var j = -1;
		var k = -1;
		switch ( ch ) {
			case '<':
				// Nowiki or HTML comment. Must be closed.
				if ( s.charAt( next + 1 ) === '!' ) {
					// HTML comment. Cannot be nested.
					i = s.indexOf( '-->', next + 3 );
					if ( i < 0 ) {
						result += s + '-->';
						s = '';
					} else {
						result += s.substring( 0, i + 3 );
						s = s.substring( i + 3 );
					}
				} else if ( s.charAt( next + 1 ).toLowerCase() === 'n' ) {
					// Start of nowiki. The regexps ignore case, so the tag may be written
					// as <NOWIKI> as well. Nowiki may contain HTML comments!
					in_nowiki = true;
					regexp = nowiki_regexp;
					// Copy just "<nowiki"; the rest of the tag is plain text for nowiki_regexp.
					result += s.substring( 0, next + 7 );
					s = s.substring( next + 7 );
				} else {
					// End of nowiki. Searched for and found only if in_nowiki === true
					in_nowiki = false;
					regexp = base_regexp;
					i = s.indexOf( '>', next + 1 ); // End of tag
					result += s.substring( 0, i + 1 );
					s = s.substring( i + 1 );
				}
				break;
			case '\x05':
				// Table start
				if ( !with_tables ) {
					result += s.substring( 0, next );
					get_out = true;
					break;
				}
				/* fall through */
			case '\x07':
				if ( ch === '\x07' && !with_galleries ) {
					result += s.substring( 0, next );
					get_out = true;
					break;
				}
				/* fall through */
			case '\x01':
				// Start of template, table, or gallery. The matching closing placeholder
				// is always the next character code.
				result += s.substring( 0, next + 1 );
				push_end( String.fromCharCode( ch.charCodeAt( 0 ) + 1 ) );
				s = s.substring( next + 1 );
				break;
			case '\x06':
				// Table end
				if ( break_at_pipe && !endings ) {
					result += s.substring( 0, next );
					get_out = true;
					break;
				}
				/* fall through */
			case '\x02':
				// End of a template or table
				result += s.substring( 0, next );
				if ( !endings || endings[endings.length - 1] !== ch ) {
					// Spurious template or table end: emit it as literal characters.
					if ( ch === '\x02' ) {
						result += '}}';
					} else {
						result += '|}';
					}
				} else {
					result += pop_end();
				}
				s = s.substring( next + 1 );
				break;
			case '\x08':
				// End of gallery
				result += s.substring( 0, next + 1 );
				if ( endings && endings[endings.length - 1] === ch ) {
					pop_end();
				}
				s = s.substring( next + 1 );
				break;
			case '\x03':
			case '[': {
				if ( !with_links && !endings ) {
					// The caller treats everything before this bracket as consumed, so
					// that text must be part of the result; otherwise it would be lost.
					result += s.substring( 0, next );
					get_out = true;
					break;
				}
				// Image links must be treated specially, since they may contain nested links
				// in the caption!
				i = next;
				while ( i < s.length && s.charAt( i ) === ch ) {
					i++;
				}
				if ( ch === '\x03' && i < s.length && s.charAt( i ) === '[' ) {
					i++;
				}
				// If set, it's 'image:' or 'file:' and we have an image link
				var initial = get_initial( i, s );
				// Closing token matching the bracket right in front of the link text.
				var closer = ( s.charAt( i - 1 ) === '[' ) ? ']' : '\x04';
				// Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |}
				var lk_text = sanitize(
					s.substring( i ),
					false, // No links at top-level allowed
					caption_level + 1,
					false, // No thumbs
					true, // Break at pipe
					false, // No tables
					false // No galleries
				);
				var lk_text_length = consumed[caption_level];
				j = i + lk_text_length;
				if ( j >= s.length || s.charAt( j ) !== '|' ) {
					// No pipe: either the whole text was used up ([[Foo or [bar), or we
					// should be on the closing ]] or ] of [[Foo]] or [bar].
					if ( initial && allow_only_thumbs ) {
						// Should in any case have started with [[, not [
						result += s.substring( 0, i - 1 ) + '\x03:' + initial +
							lk_text.substring( initial.length ) + '\x04';
					} else {
						result += s.substring( 0, i ) + lk_text + closer;
					}
					if ( j >= s.length ) {
						s = '';
					} else if ( s.charAt( j ) === ']' || s.charAt( j ) === '\x04' ) {
						// Indeed closing the link
						s = s.substring( j + 1 );
					} else {
						s = s.substring( j );
					}
					break;
				}
				// Pipe found.
				k = j;
				var caption = null;
				var used = 0;
				if ( !initial ) {
					// Not an image link. Must be something like [[Foo|Bar]].
					caption = sanitize(
						s.substring( k + 1 ),
						// No links, please
						false,
						caption_level + 1,
						// No thumbs either
						false,
						// Don't care about pipes
						false,
						// Allow tables (yes, parser allows that!)
						true,
						// Allow galleries (?)
						true
					);
					// Now we're at [[, [, ]], or ]
					used = consumed[caption_level];
					result += s.substring( 0, i ) + lk_text + '|' + caption + closer;
				} else {
					var q = s.substring( k );
					// We assume that there are no templates, nowikis, and other nasty things
					// in the parameters. Search forward until the next [, {, ], }
					var l = q.search( /[\x01\x02\x03[\x04\]{}\x05\x06\x07\x08]/ );
					if ( l < 0 ) {
						l = q.length;
					}
					if ( l + 1 < q.length ) {
						q = q.substring( 0, l + 1 );
					}
					var is_thumb = q.search( /\|\s*thumb(nail)?\s*[|\x04]/ ) >= 0;
					var img_width = /\|\s*(\d+)px\s*[|\x04]/.exec( q );
					if ( img_width && img_width.length > 1 ) {
						img_width = parseInt( img_width[1], 10 );
						if ( isNaN( img_width ) ) {
							img_width = null;
						}
					} else {
						img_width = null;
					}
					if ( !img_width ) {
						// No explicit width: thumbs count as small, everything else as large.
						img_width = is_thumb ? 180 : 301;
					}
					var is_small = img_width <= 300;
					// Caption starts at the last pipe before l. If that is a parameter,
					// it doesn't hurt.
					var m = k + q.lastIndexOf( '|', l );
					caption = sanitize(
						s.substring( m + 1 ),
						// Allow links only if it's a thumb
						is_thumb,
						caption_level + 1,
						allow_thumbs && is_thumb,
						// Don't break at pipe
						false,
						// Tables only if it's a thumb
						is_thumb,
						// Allow galleries for thumbs (?)
						is_thumb
					);
					used = consumed[caption_level];
					// caption used 'used' chars from m+1, s.charAt(m+1+used) === '\x04'
					is_thumb = allow_thumbs && is_small;
					if ( is_thumb || !allow_only_thumbs ) {
						result += s.substring( 0, i - 1 ) + '\x03' + lk_text;
					} else {
						// Turn the inline image into a plain link to the image page.
						result += s.substring( 0, i - 1 ) + '\x03:' + initial +
							lk_text.substring( initial.length );
					}
					result += s.substring( k, m + 1 ) + caption + '\x04';
					k = m;
				}
				// Continue behind the caption, skipping the closing ]] if it is there.
				next = k + 1 + used;
				if ( next < s.length ) {
					if ( s.charAt( next ) !== '\x04' ) {
						s = s.substring( next );
					} else {
						s = s.substring( next + 1 );
					}
				} else {
					s = '';
				}
				break;
			}
			case '\x04':
			case ']':
				// Extra bracket.
				result += s.substring( 0, next );
				if ( !caption_level && !break_at_pipe ) {
					// Outermost level: keep it as literal text.
					result += ( ch === ']' ? ']' : ']]' );
					s = s.substring( next + 1 );
				} else {
					// Inside a link: let the caller deal with it.
					get_out = true;
				}
				break;
			case '|':
				result += s.substring( 0, next );
				if ( break_at_pipe && !endings ) {
					// Pipe character at top level
					get_out = true;
				} else {
					result += '|';
					s = s.substring( next + 1 );
				}
				break;
		} // end switch
	} // end while
	if ( in_nowiki ) {
		result += '</nowiki>'; // Make sure this nowiki is closed.
	}
	// Close open templates, tables and galleries; a table end goes on a line of its own.
	while ( endings ) {
		ch = pop_end();
		result += ( ch === '\x06' ? '\n' : '' ) + ch;
	}
	// When we bailed out, 's' still starts at the beginning of the current search
	// window and 'next' is the offset of the character that stopped us.
	report_consumed( initial_length - ( get_out ? ( s.length - next ) : 0 ) );
	return result;
}
// Replace multi-character tokens by one-character placeholders, simplifying the
// subsequent processing.
var s = input.replace( /\{\{/g, '\x01' )
.replace( /\n\s*\|\}\}\}/g, '\n\x06\x02' ) // Table end + template end
.replace( /\}\}/g, '\x02' )
.replace( /\[\[/g, '\x03' )
.replace( /\]\]/g, '\x04' )
.replace( /\n\s*\{\|/g, '\n\x05' ) // Table start and end must be on own line
.replace( /^\s*\{\|/, '\x05' ) // Table start at the very beginning
.replace( /\n\s*\|\}/g, '\n\x06' ) // (we strip leading whitespace)
.replace( /<\s*gallery\s*>/g, '\x07' )
.replace( /<\/\s*gallery\s*>/g, '\x08' );
s = sanitize( s, true, 0, true, false, true, true );
// with links, allow thumbs, don't break at pipe, allow tables, allow galleries
return s.replace( /\x01/g, '{{' )
.replace( /\x02/g, '}}' )
.replace( /\x03/g, '[[' )
.replace( /\x04/g, ']]' )
.replace( /\x05/g, '{|' )
.replace( /\x06/g, '|}' )
.replace( /\x07/g, '<gallery>' )
.replace( /\x08/g, '</gallery>' );
}
};
}() );
// </nowiki>