Sunday, 19 October 2014

RTF to HTML Conversion

Converting document files is never easier, I recently had requirement to convert some RTF text into Simple HTML using SQL Server.

SQL Server RTF to HTML Conversion Code Limitations:
      • Color tags are not handled
      • Only few List item tags handled
      • Some of the tags are marked as not necessary and replaced with nothing 

Script is for RTF to HTML converter
USE SQLCache
GO
------------------------------------------------------------------------------------------------------
---- Step 1 ---------- Create Table RTFFiles and import Text into Column RTFFileText ------------------
CREATE TABLE RTFFiles
(
FileID INT IDENTITY(1, 1),
RTFFileText NVARCHAR(MAX),
HTMLFileText NVARCHAR(MAX),
)
GO
-------------------------------------------------------------------------------------------------------
---- Step 2 ---------- Create Conversion Stored Procedure ---------------------------------------------
/*
Script By: Amna Asif for http://sqlcache.blogspot.com/
Purpose: To Convert RTF Text into Simple HTML through SQL Server
*/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE PROCEDURE [dbo].[ConvertRTFtoHTML] ( @RTFFileID INT )
AS
BEGIN
DECLARE @RTFFileText NVARCHAR(MAX)
DECLARE @fontSize VARCHAR(20)
DECLARE @fontType VARCHAR(20)
DECLARE @startfont VARCHAR(MAX)
--SELECT FileText from RTFFiles where fileid=@RTFFileID
---- Convert Bold Tags
SELECT @RTFFileText = REPLACE(REPLACE(REPLACE(RTFFileText, '\ \', '\'),'\b0', '</b>'), '\b', '<b>')
FROM RTFFiles
WHERE FileID = @RTFFileID
--- Addition for Specific Client
SELECT @RTFFileText = REPLACE(@RTFFileText, '\<b> - : </b>', ' - : ')
--- Convert Italic
SELECT @RTFFileText = REPLACE(REPLACE(@RTFFileText, '\i0', '</i>'), '\i', '<i>')
--- Convert Underline
SELECT @RTFFileText = REPLACE(REPLACE(REPLACE(@RTFFileText, '\ulnone','</u>'), '\ul0', '</u>'),'\ul', '<u>')
--- Convert Alignments
SELECT @RTFFileText = REPLACE(REPLACE(REPLACE(@RTFFileText, '\pard\\qc', '<div align=center>'), '\pard\\qr', '<div align=right>'), '\pard','<div align=left>')
--- Remove \pard from the tags so it doesn't get confused with \par.
SELECT @RTFFileText = REPLACE(@RTFFileText, '\pard', '')
--- Convert line breaks
SELECT @RTFFileText = REPLACE(@RTFFileText, '\par', '<br>')
--- Convert Tabs
SELECT @RTFFileText = REPLACE(REPLACE(REPLACE(@RTFFileText, '\par', ' '), '\(tab)', ''),'\tab', '&nbsp;')
--Convert \\ to \
SELECT @RTFFileText = REPLACE(REPLACE(@RTFFileText, '\\', '\'), '\ \', '\')
--- Convert \{ to to {
SELECT @RTFFileText = REPLACE(@RTFFileText, '\{', '{')
--Convert \) to }
SELECT @RTFFileText = REPLACE(@RTFFileText, '\} ', '}')
----Replace
--select @RTFFileText= replace(@RTFFileText,' ',' ')
--- Remove color tags because they are not handled
SELECT @RTFFileText = CASE WHEN @RTFFileText LIKE '%\colortbl%'
THEN REPLACE(@RTFFileText,
SUBSTRING(@RTFFileText, CHARINDEX('\colortbl', @RTFFileText),
CHARINDEX('}', @RTFFileText, CHARINDEX('\colortbl', @RTFFileText))- CHARINDEX('\colortbl', @RTFFileText)),
'')
ELSE @RTFFileText
END
SELECT @RTFFileText = REPLACE(@RTFFileText, '\nowidctlpar\tx288', '')
--- Set Font Size
SELECT @fontSize = CASE WHEN @RTFFileText LIKE '%{\f9\fs%'
THEN SUBSTRING(@RTFFileText,
CHARINDEX('{\f9\fs',@RTFFileText) + 7, 2)
ELSE 12
END
--- Set Font Type
SELECT @fonttype = CASE WHEN @RTFFileText LIKE '%\fprq%'
THEN SUBSTRING(@RTFFileText,
CHARINDEX('\fprq',@RTFFileText) + 7,
CHARINDEX(';', @RTFFileText,CHARINDEX('\fprq', @RTFFileText)+ 7)- ( CHARINDEX('\fprq', @RTFFileText)+ 7 ))
ELSE 'Arial'
END
--- Convert Font style
SELECT @RTFFileText = CASE WHEN @RTFFileText LIKE '%}}%'
THEN REPLACE(@RTFFileText, SUBSTRING(@RTFFileText, 1, CHARINDEX('}}', @RTFFileText)+ 1)
, ' <span style="font-family:'+ ISNULL(@fonttype, 'Arial') + ';font-size:' + ISNULL(@fontSize, '12')+ '"> ')
ELSE @RTFFileText
END
--- Close Span
SELECT @RTFFileText = REPLACE(@RTFFileText, '}}', '}</span>')
--- Remove Extra {}
SELECT @RTFFileText = REPLACE(@RTFFileText, '{}', '')
--------------------------------------- Start SubString Looping
--------------------------------------- Convert file text in sub Strings to deal with tags
DECLARE @totalSize INT,
@StartIndex INT,
@EndIndex INT,
@newString VARCHAR(MAX)= ''
SELECT @StartIndex = 1,
@EndIndex = 1,
@startfont = '',
@fontSize = 12
WHILE ( @StartIndex > 0
AND @EndIndex > 0
)
BEGIN
SELECT @StartIndex = ( SELECT CHARINDEX('{', @RTFFileText, @StartIndex) )
SELECT @EndIndex = ( SELECT CHARINDEX('}', @RTFFileText, @EndIndex) )
SELECT @startfont = REPLACE(ISNULL(SUBSTRING(@RTFFileText, @StartIndex, CASE WHEN @EndIndex > @StartIndex THEN @EndIndex - @StartIndex + 1
ELSE 0
END),
''), '{}', '')
SELECT @newstring = REPLACE(REPLACE(REPLACE(REPLACE(@startfont, '{\f9\fs18', '<span style="font-size:' + @fontSize + '"> '),
'}', '}</span>'),
'\fs18',
'<span style="font-size:'
+ @fontSize + '"> '), '}',
'}</span>')
SELECT @newstring = CASE WHEN @newstring LIKE '%\li0 \li288%'
THEN REPLACE(REPLACE(@newstring, '\li0 \li288', '<dd>'),
'}', '</dd>')
ELSE @newstring
END
SELECT @newstring = CASE WHEN @newstring LIKE '%\li0 %'
THEN REPLACE(REPLACE(@newstring, '\li0', '<dt>'),
'}', '</dt>')
ELSE @newstring
END
SELECT @newstring = CASE WHEN @newstring LIKE '%<b>%'
AND @newstring NOT LIKE '%</b>%'
THEN @newstring + '</b>'
ELSE @newstring
END
SELECT @StartIndex = CASE WHEN @StartIndex = 0 THEN -999
ELSE @EndIndex + 1
END
SELECT @EndIndex = CASE WHEN @EndIndex = 0 THEN -999
ELSE @EndIndex + 1
END
SELECT @RTFFileText = REPLACE(@RTFFileText, @startfont,
REPLACE(REPLACE(REPLACE(@newstring, '{', ''), '}', ''),'\rtf1\ansi', ''))
END
--------------------------------------- End SubString Looping
--------------------------------------- Start ListItem Looping
--------------------------------------- Add closing Tags of Lists
IF ( @RTFFileText LIKE '%<dl>%' )
BEGIN
WHILE ( @StartIndex < LEN(@RTFFileText) )
BEGIN
SELECT @StartIndex = REPLACE(( SELECT CHARINDEX('>td<', REVERSE(@RTFFileText)) - LEN(@RTFFileText)
), '-', '')
SELECT @EndIndex = REPLACE(( SELECT CHARINDEX('>dd/<', REVERSE(@RTFFileText)) - LEN(@RTFFileText)
), '-', '')
SELECT @RTFFileText = STUFF(@RTFFileText, @StartIndex - 3, 5,' <dl> <dt> ')
SELECT @RTFFileText = STUFF(@RTFFileText, @EndIndex + 3, 5, ' </dd> </dl> ')
SELECT @StartIndex = @EndIndex + @StartIndex
END
END
--------------------------------------- END ListItem Looping
--- Convert list item Tags
SELECT @RTFFileText = REPLACE(@RTFFileText, '\li0', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\li250', '<li>')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\li500', '<li>')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\li288', '<li>')
------------------------ Start --- Extra Tags ------------------------------ Remove extra tags that are not necessary to handle
SELECT @RTFFileText = REPLACE(REPLACE(REPLACE(@RTFFileText, '\fs18 ',''), '{', ''), '}', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\viewkind4\uc1', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\f0', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\fs9', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\f9', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\fs16', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\fs20', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\cf1', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\cf1\f1130/90 \cf0 \cf1\f1', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\cf0', '')
SELECT @RTFFileText = REPLACE(@RTFFileText, '\f1', '')
------------------------ End --- Extra Tags ------------------------------
--- Remove Extra Span Tags
SELECT @RTFFileText = REPLACE(@RTFFileText,
'<span style="font-size:12"> <br><span style="font-size:12"> <br><span style="font-size:12">',
'<span style="font-size:12"> <br>')
SELECT @RTFFileText = REPLACE(@RTFFileText, '<BR> <BR>', '<BR>')
--- Update HTMLFileText column of RTFFiles Table
UPDATE RTFFiles
SET HTMLFileText = @RTFFileText
WHERE FileID = @RTFFileID
END
---------------------------------------------------------------------------------------------------------
---- Step 3 ---------- Execute SP against FileID whose text is to be converted --------------------------
EXECUTE [dbo].[ConvertRTFtoHTML] @RTFFileID = 1