Sunday, 7 September 2014

SQL Server Import\ Export Image\Document Files

Documents storing inside database might be a requirement of SQL Server driven application. SQL Server uses nvarchar for unicode texts whereas varbinary data type for storing document files (like images, word documents, pdfs) in database.
Suppose you have a folder containing various document files (like images, word documents, pdfs, html) and it is required to perform certain actions to upload data from file system in to the database.

1.        List all files with their path in a folder and sub-folders to a Table
--------------------------------------------------------------------- 
CREATE PROCEDURE GetAllDirectoryFilesandFolders -- GetAllDirectoryFilesandFolders @RootDir='E:\Post\SqlCache'
(
@RootDir VARCHAR(1000)='E:\SQLCache'
)
AS
BEGIN

DECLARE  @RowIndex  INT
,        @SearchDir VARCHAR(1000)
  

IF EXISTS ( SELECT 1 FROM SQLCacheDocuments.SYS.TABLES WHERE name ='DocumentsRepository' ) 
DROP TABLE SQLCacheDocuments.dbo.DocumentsRepository


CREATE TABLE SQLCacheDocuments.dbo.DocumentsRepository
(
  FileID INT IDENTITY(1,1),
  DocumentFilePath VARCHAR(max),
  isfile INT,
  DocumentFileName VARCHAR(max),
  FileDepth INT
)

--- Get Root Direcotry Files\Folders
INSERT INTO DocumentsRepository(DocumentFileName, FileDepth,isFile)
EXEC MASTER.SYS.xp_dirtree @RootDir,1,1



--- If No file\Folder exits then exit 
IF NOT exists( SELECT * FROM DocumentsRepository )
RETURN


--- Update File\Folder Path
UPDATE DocumentsRepository
SET DocumentFilePath = @RootDir + '\' + DocumentFileName


--- Next root directory
SELECT @RowIndex=min(FileID) FROM DocumentsRepository


--- Get Sub Direcotry Files\Folders
WHILE 1=1
BEGIN

   SELECT @SearchDir = DocumentFilePath
   FROM DocumentsRepository
   WHERE FileID=@RowIndex

   INSERT INTO DocumentsRepository(DocumentFileName, FileDepth,isfile)
   EXEC MASTER.SYS.xp_dirtree @SearchDir,1,1

   UPDATE DocumentsRepository
   SET DocumentFilePath = @SearchDir + '\' + DocumentFileName
   WHERE DocumentFilePath is null

   SET @RowIndex = @RowIndex + 1

   -- Exit if current Rowindex is Max FileID
   IF ( @RowIndex > (SELECT max(FileID) FROM DocumentsRepository) or @RowIndex is null )
   BREAK

END


SELECT SUM(CASE ISFILE WHEN 0 THEN 1 ELSE 0 END )  TolalFolders
,      SUM(CASE ISFILE WHEN 1 THEN 1 ELSE 0 END )  TolalFiles
FROM  SQLCacheDocuments.dbo.DocumentsRepository


Delete
FROM  SQLCacheDocuments.dbo.DocumentsRepository
WHERE isfile=0


SELECT FileID, DocumentFilePath, DocumentFileName
FROM SQLCacheDocuments.dbo.DocumentsRepository
ORDER BY FileID

END
--------------------------------------------------------------------- 

2.       Divide files into two Tables based on operations
                                                   i.      Files (like Images, PDFs, Word documents ) which are inserted as binary
                                                 ii.       Files (text, HTML) whose content is inserted into the database
---------------------------------------------------------------------
SELECT * INTO SQLCacheDocuments.dbo.TextDocuments
FROM SQLCacheDocuments.dbo.DocumentsRepository
WHERE DocumentFileName like '%.html'
  OR  DocumentFileName like '%.TXT'
  
Alter table SQLCacheDocuments.dbo.TextDocuments ADD FileText NVARCHAR(max)

---------------------------------------

SELECT * INTO SQLCacheDocuments.dbo.HandDocuments
FROM SQLCacheDocuments.dbo.DocumentsRepository
WHERE DocumentFileName not like '%.html'
 AND  DocumentFileName not like '%.TXT'

ALTER TABLE SQLCacheDocuments.dbo.HandDocuments ADD FileBinary VARBINARY(max) 
---------------------------------------------------------------------

3. Import files (like Images, PDFs, Word documents ) as binary into the database table
--------------------------------------------------------------------- 
;WITH DocumentFiles
AS
(
SELECT FileID, DocumentFilePath
FROM SQLCacheDocuments.dbo.HandDocuments
WHERE FileBinary IS NULL
)
SELECT ' UPDATE SQLCacheDocuments.dbo.HandDocuments
                                 SET FileBinary= tempImg.BULKCOLUMN
                                  FROM OPENROWSET(BULK N''' + DocumentFilePath + ''', SINGLE_BLOB) as tempImg, SQLCacheDocuments.dbo.HandDocuments FileBinary
                                  WHERE CONVERT(int,FileBinary.FileID)='+ CONVERT(VARCHAR(200),FileID)+' GO '
FROM DocumentFiles
ORDER BY FileID
---------------------------------------------------------------------

4. Import content of files as text into the database table
--------------------------------------------------------------------- 
;WITH DocumentFiles
AS
(
SELECT FileID, DocumentFilePath
FROM SQLCacheDocuments.dbo.TextDocuments
WHERE FileText IS NULL
)
SELECT ' UPDATE SQLCacheDocuments.dbo.TextDocuments
                                 SET FileText= tempImg.BULKCOLUMN
                                  FROM OPENROWSET(BULK N''' + DocumentFilePath + ''', SINGLE_CLOB) as tempImg, SQLCacheDocuments.dbo.TextDocuments TextNote
                                  WHERE CONVERT(int,TextNote.FileID)='+ CONVERT(VARCHAR(200),FileID)+' GO '
FROM DocumentFiles
ORDER BY FileID
---------------------------------------------------------------------

5. Verify files binary by converting back to OS file
---------------------------------------------------------------------
;WITH DocumentFiles
AS
(
SELECT FileID, DocumentFileName
FROM SQLCacheDocuments.dbo.HandDocuments 
WHERE FILEID=1
)
SELECT ' EXEC master..xp_cmdshell ''BCP "SELECT FileBinary FROM SQLCacheDocuments.dbo. HandDocuments WHERE FileID ='
                        +Cast(FILEID as varchar(50))+'"  queryout "E:\Post\'+DocumentFileName
                        +'" -T -N -SsqlserverR2'''
FROM DocumentFiles
ORDER BY FileID
---------------------------------------------------------------------


6. Validate content of files by comparing OS file text and column value of database table.

7. Check files size and length of documents 
---------------------------------------------------------------------
SELECT (CAST(DATALENGTH(FileBinary)AS BIGINT) /1024.00) FileSize,* FROM SQLCacheDocuments.dbo.HandDocuments         
ORDER BY CAST(DATALENGTH(FileBinary) AS BIGINT) DESC       
        
SELECT Len(FileText),* FROM SQLCacheDocuments.dbo.TextDocuments
ORDER BY CAST(Len(FileText) AS BIGINT) DESC 
---------------------------------------------------------------------