XML Project

The task was to collect revision information as XML records from 477 legacy documents (most only in PDF format, some in Word doc or docx) in 44 collections over 5 releases, and also to convert all PDF files to Word files.  

The resulting historical information is up on MSDN (example).

Each document (PDF or otherwise) has a Revision Summary table containing a set of release dates, revision numbers, and revision classes, for example:

Rather than do this by hand, I wrote some PowerShell and Word VBA (Visual Basic for Applications) scripts to extract the release information from each file’s Revision Summary table and save it as XML records, one per release per file.  For each revision source directory:

  1. The first PowerShell script (PDFtoDoc.ps1) goes through a given directory and opens each PDF file in Word, then saves it as a doc file.  
  2. A follow-on PowerShell script (WordExtractDir.ps1) opens each new doc or legacy docx file in a given directory, and executes a Word macro (ReturnDocInfo, in ReturnDocInfo.vba) to extract and incrementally write release information records to each document's own XML file.
    1. The Word macro gets the document short name (MS-XYZ) and its title, then finds the revision table and reads its last row (the most recent revision).  The Word macro then calls a subroutine (OutputDocInfo) that appends one XML record for that revision to the document's MS-XYZ.xml file.

After all the records were extracted from all documents for all releases, I then used Bash with its sed and awk tools to post-process the 477 document XML files, also creating two summary XML files (collection structure and contents).  

I handed the set of 479 post-processed XML files to a subsequent tool writer who converted them into the format seen on MSDN.


PDFtoDoc.ps1 (PowerShell)

###
#  1. Opens each .pdf file in the given path in Word
#  2. Word saves that file in doc format
#
#  Usage: PDFtoDoc directory-path
###

$documents_path = $args[0]     # for example, c:\MyProtocolPDFs\Release2014

# Fail early, before starting Word, when the directory argument is missing
# or does not name an existing directory.
if (-not $documents_path -or -not (Test-Path -LiteralPath $documents_path -PathType Container)) {
    Write-Error "Usage: PDFtoDoc directory-path (directory not found: '$documents_path')"
    return
}

$saveasdoc = 0                 # SaveAs format identifier (0 = wdFormatDocument, Word 97-2003 .doc)
$dont_save = 0                 # Close option (0 = wdDoNotSaveChanges)

$word_app = New-Object -ComObject Word.Application

try {
    # -Filter uses the legacy file-system wildcard, which can also match longer
    # extensions that merely start with "pdf"; the Where-Object keeps only real
    # .pdf files and skips directories.
    Get-ChildItem -Filter *.pdf -Path $documents_path |
        Where-Object { -not $_.PSIsContainer -and $_.Extension -ieq ".pdf" } |
        ForEach-Object {

            $doc_filename = "$($_.DirectoryName)\$($_.BaseName).doc"
            Write-Host $doc_filename

            $document = $word_app.Documents.Open($_.FullName)
            try {
                $document.SaveAs([ref] $doc_filename, [ref] $saveasdoc)
            }
            finally {
                # Always close the document, even if SaveAs failed. After a
                # successful SaveAs there is nothing left to save; after a
                # failure, closing without saving avoids a save prompt.
                $document.Close([ref] $dont_save)
            }
        }
}
finally {
    # Always shut Word down so a failure does not leave an orphaned
    # WINWORD.EXE process behind.
    $word_app.Quit()
}

 


WordExtractDir.ps1 (PowerShell)

##
#  1. Starts in a directory that contains either doc or docx files
#  2. Determines which type is in this directory
#  3. Opens each doc/docx file in Word
#  4. Calls a Word macro that extracts the revision info and saves it into doc-specific XML files
#
#  Usage:  WordExtractDir directory-path
##


$dir = $args[0]

echo $dir

# Fully qualified name of the Word macro that extracts the revision info
# from the active document.
$extractMacro = "Normal.NewMacros.ReturnDocInfo"

# Walk the directory tree once and split the result by extension, instead of
# enumerating it separately for counting .doc, counting .docx, and processing.
$candidates = @(Get-ChildItem -path $dir -recurse -include "*.doc", "*.docx" |
                Where-Object { -not $_.PSIsContainer })

$docFiles  = @($candidates | Where-Object { $_.Extension -ieq ".doc" })
$docxFiles = @($candidates | Where-Object { $_.Extension -ieq ".docx" })

# A directory is expected to hold one type only; .doc wins if any are present.
if ($docFiles.Count -gt 0) {
    $files = $docFiles
} else {
    $files = $docxFiles
}
$num = $files.Count

echo ""

if ($num -eq 0) {
    Write-Warning "No .doc or .docx files found under '$dir'"
    return
}

$word = New-Object -ComObject Word.Application
$word.visible = $false

try {
    $i = 0
    foreach ($file in $files) {

        $doc = $word.documents.open($file.fullname)
        try {
            # The macro works on the active document; its return value is not used.
            $null = $word.run($extractMacro)
        }
        finally {
            # Close the document even if the macro failed, so the next file
            # starts from a clean state.
            $doc.close()
        }

        echo ([string] ($num - $i) + " - " + $file.Name)   # counts down
        $i = $i + 1
    }
}
finally {
    # Always shut Word down so a failure does not leave an orphaned,
    # invisible WINWORD.EXE process behind.
    $word.Quit()
}

ReturnDocInfo.vba (Visual Basic for Applications)

Sub ReturnDocInfo()
'---------------------------------------------------------------------
' Extracts release information from the active document and appends it,
' via OutputDocInfo, to that document's XML file.
'
' Reads:
'   - the document short name from the file name ("[MS-XYZ].doc" or
'     "[MS-XYZ].docx" -> "MS-XYZ")
'   - the title from the first paragraph
'   - the date (column 1) and change type (column 3) from the last row
'     of the first table in the document
'
' dirDate and bMore are edited by hand before each release run.
'---------------------------------------------------------------------

    Dim dirDate As String  ' Release subdirectory name
    dirDate = "2013-01-31" ' Manually set per release

    Dim bMore As Boolean   ' True if there will be another Release concatenated; False if this is the last one
    bMore = True

    Dim curDoc As Word.Document
    Dim rng As Word.Range
    Dim tblRev As Word.Table
    Dim rowLast As Word.Row
    Dim pTitle As Paragraph

    Dim sDocName As String
    Dim sTitleText As String
    Dim sTitle As String
    Dim sDate As String
    Dim sRevDate As String
    Dim sDateCell As String
    Dim sChangeCell As String
    Dim sChange As String
    Dim iDot As Long

    Debug.Print
    Debug.Print "-- " + dirDate + " -- "; Now()            ' Show script starting time

'---------
' Get the protocol document name
'---------
    Set curDoc = ActiveDocument
    sDocName = curDoc.Name                             ' "[MS-XYZ].doc" or "[MS-XYZ].docx"

    ' Strip the extension whatever its length, so .docx files do not keep a
    ' trailing "]" the way a fixed Len("].doc") offset would leave them.
    iDot = InStrRev(sDocName, ".")
    If iDot > 1 Then sDocName = Left(sDocName, iDot - 1)

    ' Strip the surrounding square brackets, if present.
    If Len(sDocName) >= 2 And Left(sDocName, 1) = "[" And Right(sDocName, 1) = "]" Then
        sDocName = Mid(sDocName, 2, Len(sDocName) - 2)
    End If

'---------
' Get the protocol document title
'---------
    Set rng = curDoc.Content()

    Set pTitle = rng.Paragraphs(1)
    sTitle = pTitle.Range.Text
    sTitleText = Mid(sTitle, 1, Len(sTitle) - 1)  ' remove the trailing paragraph mark
    sTitle = Mid(sTitleText, Len(sDocName) + 6)   ' remove leading "[MS-XYZ]: ."

    ' Two-line titles contain an embedded line-break character. The search
    ' character had been lost from this statement (an empty search string makes
    ' Replace a no-op). Word stores a manual line break as Chr(11), which fits
    ' the "only on two-line titles" observation -- TODO confirm against such a
    ' document.
    sTitle = Replace(sTitle, Chr(11), "")

    Debug.Print sDocName; " - "; sTitle

'---------
' Locate the last row of the revision table (first table in the document)
'---------
    If rng.Tables.Count = 0 Then
        ' Nothing to extract; report and skip rather than raise a run-time
        ' error in the middle of an unattended batch run.
        Debug.Print sDocName; " - no table found, document skipped"
        Exit Sub
    End If

    Set tblRev = rng.Tables(1)
    Set rowLast = tblRev.Rows(tblRev.Rows.Count)

    If rowLast.Cells.Count < 3 Then
        Debug.Print sDocName; " - last table row has fewer than 3 cells, document skipped"
        Exit Sub
    End If

'---------
' Get the most recent publication date (last row in table)
'---------
    ' Cell text always ends with two characters: a paragraph mark, Chr(13),
    ' followed by the end-of-cell mark, Chr(7).
    sDateCell = rowLast.Cells(1).Range.Text
    sDate = Mid(sDateCell, 1, Len(sDateCell) - 2)

    ' Table dates are MM/DD/YYYY; reorder to YYYY-MM-DD.
    sRevDate = Mid(sDate, 7, 4) + "-" + Mid(sDate, 1, 2) + "-" + Mid(sDate, 4, 2)

'---------
' Get the change type
'---------
    sChangeCell = rowLast.Cells(3).Range.Text
    sChange = Mid(sChangeCell, 1, Len(sChangeCell) - 2)
    If sChange = "No change" Then sChange = "None"

    Debug.Print sRevDate; " - "; sChange

'---------
' Write out this XML record
'---------
    Call OutputDocInfo("C:\Output", dirDate, bMore, sDocName, sTitle, sRevDate, sChange)

End Sub

'---------------------------------------------------------------------

Sub OutputDocInfo(folderPath As String, dirDate As String, more As Boolean, docName As String, docTitle As String, docDate As String, docChange As String)
'---------------------------------------------------------------------
' Appends one <Release> record to <folderPath>\<docName>.xml.
'
'   folderPath - directory that holds the per-document XML files
'   dirDate    - release subdirectory name, used in the file locations
'   more       - True if further releases will be appended later;
'                False closes the </Releases> and </Protocol> elements
'   docName    - document short name, e.g. "MS-XYZ"
'   docTitle   - document title
'   docDate    - publication date, written as PublishDate
'   docChange  - revision class, written as DocumentRevision
'
' If the file does not exist yet, the XML declaration and the opening
' <Protocol> and <Releases> elements are written first.
'---------------------------------------------------------------------

    Const sExtension = ".xml"
    Const ForAppending = 8

    Dim bFirstTime As Boolean
    Dim sAny As String

    Dim quote As String
    quote = """"

    Dim sBasePath As String
    sBasePath = "\\base-path\release-dir\" + dirDate

    Dim sIntroXML As String
    Dim sWordLoc As String
    Dim sPDFLoc As String

    Dim fso As Object
    Dim textStream As Object

    Dim lErrNumber As Long
    Dim sErrSource As String
    Dim sErrDescription As String

    sIntroXML = "<?xml version=" + quote + "1.0" + quote + " encoding=" + quote + "utf-8" + quote + " ?>"
    sWordLoc = "      <DocumentFile Type=" + quote + "Word" + quote + " Location="
    sPDFLoc = "      <DocumentFile Type=" + quote + "PDF" + quote + " Location="

    Dim sFileName As String
    sFileName = folderPath + "\" + docName + sExtension

    Set fso = CreateObject("Scripting.FileSystemObject")

    bFirstTime = Not fso.FileExists(sFileName)

    Set textStream = fso.OpenTextFile(sFileName, ForAppending, True)   ' create if missing

    On Error GoTo WriteFailed

    ' Every attribute value goes through XmlAttrEscape so that characters such
    ' as & < " in a title cannot produce malformed XML.
    If bFirstTime Then
        textStream.writeline (sIntroXML)
        sAny = "<Protocol Name=" + quote + XmlAttrEscape(docName) + quote + " Title=" + quote + XmlAttrEscape(docTitle) + quote + ">"
        textStream.writeline (sAny)
        textStream.writeline ("  <Releases>")
    End If

    sAny = "    <Release PublishDate=" + quote + XmlAttrEscape(docDate) + quote + " ProtocolRevision=" + quote + "None" + quote + " DocumentRevision=" + quote + XmlAttrEscape(docChange) + quote + ">"
    textStream.writeline (sAny)

    sAny = sPDFLoc + quote + XmlAttrEscape(sBasePath + "\Downloads\[" + docName + "].pdf") + quote + " />"
    textStream.writeline (sAny)

    sAny = sWordLoc + quote + XmlAttrEscape(sBasePath + "\Documents\[" + docName + "].doc") + quote + " />"
    textStream.writeline (sAny)

    sAny = "    </Release>"
    textStream.writeline (sAny)

    If Not more Then
        sAny = "  </Releases>"
        textStream.writeline (sAny)

        sAny = "</Protocol>"
        textStream.writeline (sAny)
    End If

    ' Close explicitly so the record is flushed and the file handle released
    ' before the next document is processed.
    textStream.Close
    Exit Sub

WriteFailed:
    ' Release the file handle, then re-raise the original error to the caller.
    lErrNumber = Err.Number
    sErrSource = Err.Source
    sErrDescription = Err.Description
    textStream.Close
    Err.Raise lErrNumber, sErrSource, sErrDescription

End Sub

Private Function XmlAttrEscape(ByVal sText As String) As String
' Returns sText made safe for use inside a double-quoted XML attribute.
'   - & < > " become the predefined entities
'   - tab, LF and CR become numeric character references
'   - other control characters below 32 are dropped (not allowed in XML 1.0)
'   - characters above 127 become numeric character references, so the
'     output is plain ASCII and therefore valid under the declared utf-8
'     encoding even though the file is written through an ANSI text stream

    Dim i As Long
    Dim lCode As Long
    Dim lLow As Long
    Dim sChar As String
    Dim sOut As String

    i = 1
    Do While i <= Len(sText)
        sChar = Mid$(sText, i, 1)
        lCode = AscW(sChar) And &HFFFF&      ' AscW is signed; map to 0..65535

        ' Combine a UTF-16 surrogate pair into a single code point.
        If lCode >= &HD800& And lCode <= &HDBFF& And i < Len(sText) Then
            lLow = AscW(Mid$(sText, i + 1, 1)) And &HFFFF&
            If lLow >= &HDC00& And lLow <= &HDFFF& Then
                lCode = &H10000 + (lCode - &HD800&) * &H400& + (lLow - &HDC00&)
                i = i + 1
            End If
        End If

        Select Case lCode
            Case 38
                sOut = sOut & "&amp;"
            Case 60
                sOut = sOut & "&lt;"
            Case 62
                sOut = sOut & "&gt;"
            Case 34
                sOut = sOut & "&quot;"
            Case 9, 10, 13
                sOut = sOut & "&#" & CStr(lCode) & ";"
            Case Is < 32
                ' dropped: illegal in XML 1.0
            Case Is > 127
                sOut = sOut & "&#" & CStr(lCode) & ";"
            Case Else
                sOut = sOut & sChar
        End Select

        i = i + 1
    Loop

    XmlAttrEscape = sOut
End Function