Clean MS Word formatting from HTML


(Anthony) #1

I have pages with HTML inputs into which users tend to paste content directly from Word, and the editor I'm currently using does nothing to help clean it up. I'm hoping to replace the editor with something better, but in the meantime I was trawling around for hours looking for a decent way to clean up MS Word formatting in JavaScript. I found several solutions that had a fair go at it, but nothing that really worked well until I spotted this on an obscure post somewhere.

 

It took me so long to find it, and - for my needs at least - it works so well, that I thought I'd share it and hopefully save someone else a few hours!

 

/**
 * Removes the markup noise MS Office (Word) adds to HTML pasted into an editor.
 *
 * The clean-up runs in five passes, in this order:
 *   1. line breaks and Word's class="MsoXxx" attributes are replaced by a space;
 *   2. HTML comments (including Word's conditional comments) are removed;
 *   3. meta/link/span/font tags and Office-namespaced tags (?xml:, st1:, o:)
 *      are removed while their inner content is kept;
 *   4. style/script/applet/embed/noframes/noscript elements are removed
 *      together with everything between their opening and closing tags;
 *   5. double-quoted style="..." and start="..." attributes are removed.
 *
 * NOTE: this is a regex-based formatting cleaner, not a security sanitizer.
 * Do not rely on it to make untrusted HTML safe to insert into a page.
 *
 * @param {string} input - HTML fragment, typically pasted from Word.
 * @returns {string} The cleaned HTML; an empty string for null/undefined input.
 */
function cleanHTML(input) {
  if (input == null) {
    return '';
  }

  // 1. remove line breaks / Mso classes (replaced by a space so that
  //    adjacent words or attributes do not get glued together)
  var output = String(input).replace(/(\n|\r| class=(")?Mso[a-zA-Z]+(")?)/g, ' ');

  // 2. strip Word generated HTML comments
  output = output.replace(/<!--[\s\S]*?-->/g, '');

  // 3. remove tags, leave content if any
  output = output.replace(/<(\/)*(meta|link|span|\?xml:|st1:|o:|font)(.*?)>/gi, '');

  // 4. remove everything in between and including tags such as <style>...</style>.
  //    The opening and closing tags are matched explicitly so that a '>' or the
  //    tag name appearing inside the content cannot end the match early.
  var badTags = ['style', 'script', 'applet', 'embed', 'noframes', 'noscript'];
  for (var i = 0; i < badTags.length; i++) {
    var blockStripper = new RegExp(
      '<' + badTags[i] + '\\b[^>]*>[\\s\\S]*?<\\/' + badTags[i] + '\\s*>',
      'gi'
    );
    output = output.replace(blockStripper, '');
  }

  // 5. remove attributes such as ' style="..."' (the leading space is part of
  //    the match, so an attribute like data-style="..." is left alone)
  var badAttributes = ['style', 'start'];
  for (var j = 0; j < badAttributes.length; j++) {
    var attributeStripper = new RegExp(' ' + badAttributes[j] + '="(.*?)"', 'gi');
    output = output.replace(attributeStripper, '');
  }

  return output;
}