Remove HTML formatting from SharePoint Rich Text Editor content with JavaScript

Compared with Microsoft Word, SharePoint’s out of the box Rich Text editor seems like a tool from the dark ages. These days, anyone can use Microsoft Word to format content better than the US Mint could 20 years ago, so when it comes to publishing content in SharePoint, most of us follow the path of least resistance and just paste Word content into the SharePoint editor.

This habit, however, can cause problems when there is a need to re-purpose Rich Text and Enhanced Rich Text content elsewhere in the organization. When Engineering, for example, wants the content without all of Marketing’s 30pt Bodoni Bold fluff, we need a way to dispose of the Word HTML/XML formatting.

Have you ever looked at Word’s HTML? It has a beauty all its own. But good luck at editing it with anything other than the Word client application. Stripping the HTML with a blissfully simple “jQuery(html).text();” JavaScript won’t do in many cases, because it wipes out essential spaces and line returns as well. Regular Expressions will do the job however, but there’s a need for more than one or two.

Fortunately, some good people at 1st Class Media in Edinburgh posted a respectable set of expressions back in 2007 – http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php

function CleanWordHTML( str )
{
	str = str.replace(/<o:p>\s*<\/o:p>/g, "") ;
	str = str.replace(/<o:p>.*?<\/o:p>/g, "&nbsp;") ;
	str = str.replace( /\s*mso-[^:]+:[^;"]+;?/gi, "" ) ;
	str = str.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, "" ) ;
	str = str.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" ) ;
	str = str.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, "" ) ;
	str = str.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" ) ;
	str = str.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" ) ;
	str = str.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" ) ;
	str = str.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" ) ;
	str = str.replace( /\s*tab-stops:[^;"]*;?/gi, "" ) ;
	str = str.replace( /\s*tab-stops:[^"]*/gi, "" ) ;
	str = str.replace( /\s*face="[^"]*"/gi, "" ) ;
	str = str.replace( /\s*face=[^ >]*/gi, "" ) ;
	str = str.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, "" ) ;
	str = str.replace(/<(\w[^>]*) class=([^ |>]*)([^>]*)/gi, "<$1$3") ;
	str = str.replace( /<(\w[^>]*) style="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;
	str = str.replace( /\s*style="\s*"/gi, '' ) ; 
	str = str.replace( /<SPAN\s*[^>]*>\s*&nbsp;\s*<\/SPAN>/gi, '&nbsp;' ) ; 
	str = str.replace( /<SPAN\s*[^>]*><\/SPAN>/gi, '' ) ; 
	str = str.replace(/<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3") ; 
	str = str.replace( /<SPAN\s*>(.*?)<\/SPAN>/gi, '$1' ) ; 
	str = str.replace( /<FONT\s*>(.*?)<\/FONT>/gi, '$1' ) ;
	str = str.replace(/<\\?\?xml[^>]*>/gi, "") ; 
	str = str.replace(/<\/?\w+:[^>]*>/gi, "") ; 
	str = str.replace( /<H\d>\s*<\/H\d>/gi, '' ) ;
	str = str.replace( /<H1([^>]*)>/gi, '' ) ;
	str = str.replace( /<H2([^>]*)>/gi, '' ) ;
	str = str.replace( /<H3([^>]*)>/gi, '' ) ;
	str = str.replace( /<H4([^>]*)>/gi, '' ) ;
	str = str.replace( /<H5([^>]*)>/gi, '' ) ;
	str = str.replace( /<H6([^>]*)>/gi, '' ) ;
	str = str.replace( /<\/H\d>/gi, '<br>' ) ; //remove this to take out breaks where Heading tags were 
	str = str.replace( /<(U|I|STRIKE)>&nbsp;<\/\1>/g, '&nbsp;' ) ;
	str = str.replace( /<(B|b)>&nbsp;<\/\b|B>/g, '' ) ;
	str = str.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;
	str = str.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;
	str = str.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;
	//some RegEx code for the picky browsers
	var re = new RegExp("(<P)([^>]*>.*?)(<\/P>)","gi") ;
	str = str.replace( re, "<div$2</div>" ) ;
	var re2 = new RegExp("(<font|<FONT)([^*>]*>.*?)(<\/FONT>|<\/font>)","gi") ; 
	str = str.replace( re2, "<div$2</div>") ;
	str = str.replace( /size|SIZE = ([\d]{1})/g, '' ) ;
	return str ;
}
Advertisements