XML Project
The task was to collect revision information as XML records from 477 legacy documents (most only in PDF format, some in Word doc or docx) in 44 collections over 5 releases, and also to convert all PDF files to Word files.
The resulting historical information is up on MSDN (example).
Each document (PDF or otherwise) has a Revision Summary table containing a set of release dates, revision numbers, and revision classes. For example, one row might read (illustrative values): 01/31/2013, 1.0, No change.
Rather than do this by hand, I wrote two PowerShell scripts and a Word VBA (Visual Basic for Applications) macro to extract the release information from each file’s Revision Summary table and save it as XML records, one per release per file. For each revision source directory:
- The first PowerShell script (PDFtoDoc.ps1) goes through a given directory and opens each PDF file in Word, then saves it as a doc file.
- A follow-on PowerShell script (WordExtractDir.ps1) opens each new doc or legacy docx file in a given directory, and executes a Word macro (ReturnDocInfo, in ReturnDocInfo.vba) to extract and incrementally write release information records to each document's own XML file.
- The Word macro gets the document short name (MS-XYZ) and its title, then finds the revision table and reads the last row (the most recent revision). The macro then calls a subroutine (OutputDocInfo) that appends one XML release record for that row to that document's MS-XYZ.xml file.
After all the records were extracted from all documents for all releases, I then used Bash with its sed and awk tools to post-process the 477 document XML files, also creating two summary XML files (collection structure and contents).
I handed the set of 479 post-processed XML files to a subsequent tool writer who converted them into the format seen on MSDN.
PDFtoDoc.ps1 (PowerShell)
###
# PDFtoDoc.ps1
#
# 1. Opens each .pdf file in the given path in Word
# 2. Word saves that file in doc format, next to the source PDF
#
# Usage: PDFtoDoc directory-path
###

$documents_path = $args[0]   # for example, C:\MyProtocolPDFs\Release2014

# Fail fast (before Word is started) when the directory argument is missing or wrong.
if (-not $documents_path -or -not (Test-Path -Path $documents_path -PathType Container)) {
    Write-Error "Usage: PDFtoDoc directory-path (directory not found: '$documents_path')"
    exit 1
}

$saveasdoc = 0   # SaveAs format identifier (0 = wdFormatDocument, i.e. Word 97-2003 .doc)

$word_app = New-Object -ComObject Word.Application
try {
    # -Filter is evaluated by the file system and can also match longer
    # extensions via 8.3 short names, so the extension is re-checked here.
    Get-ChildItem -Filter *.pdf -Path $documents_path |
        Where-Object { $_.Extension -ieq '.pdf' } |
        ForEach-Object {
            $pdf = $_   # keep the file object; inside 'catch', $_ is the error record
            $doc_filename = "$($pdf.DirectoryName)\$($pdf.BaseName).doc"
            Write-Host $doc_filename

            # Reset per file so a failed Open can never act on the previous document.
            $document = $null
            try {
                $document = $word_app.Documents.Open($pdf.FullName)
                $document.SaveAs([ref] $doc_filename, [ref] $saveasdoc)
            }
            catch {
                # One bad PDF should not abort the whole batch.
                Write-Warning "Could not convert '$($pdf.FullName)': $($_.Exception.Message)"
            }
            finally {
                if ($null -ne $document) {
                    $document.Close()
                }
            }
        }
}
finally {
    # Always shut Word down, otherwise a hidden WINWORD.EXE stays running.
    $word_app.Quit()
    [void] [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word_app)
}
WordExtractDir.ps1 (PowerShell)
##
# WordExtractDir.ps1
#
# 1. Starts in a directory that contains either doc or docx files
# 2. Determines which type is in this directory (doc wins if both are present)
# 3. Opens each doc/docx file in Word
# 4. Calls a Word macro that extracts the revision info and saves it into
#    doc-specific XML files
#
# Usage: WordExtractDir directory-path
##

$dir = $args[0]

# Fail fast (before Word is started) when the directory argument is missing or wrong.
if (-not $dir -or -not (Test-Path -Path $dir -PathType Container)) {
    Write-Error "Usage: WordExtractDir directory-path (directory not found: '$dir')"
    exit 1
}

echo $dir
echo ""

$extractMacro = "Normal.NewMacros.ReturnDocInfo"

# Enumerate the tree once per type and keep the result, instead of walking it
# once to count and again to process.
$files = @(Get-ChildItem -Path $dir -Recurse -Include "*.doc")
if ($files.Count -eq 0) {
    $files = @(Get-ChildItem -Path $dir -Recurse -Include "*.docx")
}

$num = $files.Count
if ($num -eq 0) {
    Write-Warning "No .doc or .docx files found under '$dir'"
    exit 0
}

$word = New-Object -ComObject Word.Application
$word.Visible = $false

try {
    $i = 0
    foreach ($file in $files) {
        # Reset per file so a failed Open can never act on the previous document.
        $doc = $null
        try {
            $doc = $word.Documents.Open($file.FullName)
            $null = $word.Run($extractMacro)   # the macro writes the XML itself; nothing to collect
        }
        catch {
            # One bad document should not abort the whole batch.
            Write-Warning "Extraction failed for '$($file.FullName)': $($_.Exception.Message)"
        }
        finally {
            if ($null -ne $doc) {
                # Mark the document clean so Close() cannot raise a "save changes?"
                # prompt on the invisible Word instance.
                $doc.Saved = $true
                $doc.Close()
            }
        }

        echo ([string] ($num - $i) + " - " + $file.Name)   # counts down
        $i = $i + 1
    }
}
finally {
    # Always shut Word down, otherwise a hidden WINWORD.EXE stays running.
    $word.Quit()
    [void] [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word)
}
ReturnDocInfo.vba (Visual Basic for Applications)
'---------------------------------------------------------------------
' ReturnDocInfo
'
' Reads the active document's short name, title, and the last row of its
' first table (publication date in column 1, change type in column 3),
' then appends one release record to that document's XML file via
' OutputDocInfo.
'
' Takes no parameters so that it can be started by name from the driver
' script; the release directory and the "more releases follow" flag are
' edited by hand below for each run.
'---------------------------------------------------------------------
Sub ReturnDocInfo()

    Dim dirDate As String           ' Release subdirectory name
    dirDate = "2013-01-31"          ' Manually set per release

    Dim bMore As Boolean            ' True if there will be another Release concatenated; False if this is the last one
    bMore = True

    Dim curDoc As Word.Document
    Dim rng As Word.Range
    Dim pTitle As Paragraph
    Dim revTable As Word.Table
    Dim lastRow As Word.Row
    Dim closeBracket As Long
    Dim sDocName As String
    Dim sTitleText As String
    Dim sTitle As String
    Dim sDate As String
    Dim sRevDate As String
    Dim sDateCell As String
    Dim sChangeCell As String
    Dim sChange As String

    Debug.Print
    Debug.Print "-- " + dirDate + " -- "; Now()     ' Show script starting time

    '---------
    ' Get the protocol document name
    '---------
    Set curDoc = ActiveDocument
    sDocName = curDoc.Name                          ' starts as "[MS-XYZ].doc" or "[MS-XYZ].docx"

    ' Cut at the closing bracket rather than assuming a ".doc" suffix, so that
    ' ".docx" files do not end up with a trailing "]" in the short name.
    closeBracket = InStr(sDocName, "]")
    If Left$(sDocName, 1) <> "[" Or closeBracket < 3 Then
        Debug.Print "Skipped, unexpected file name: "; sDocName
        Exit Sub
    End If
    sDocName = Mid$(sDocName, 2, closeBracket - 2)

    '---------
    ' Get the protocol document title
    '---------
    Set rng = curDoc.Content
    Set pTitle = rng.Paragraphs(1)
    sTitle = pTitle.Range.Text
    sTitleText = Mid(sTitle, 1, Len(sTitle) - 1)    ' remove one trailing character, probably CR
    sTitle = Mid(sTitleText, Len(sDocName) + 6)     ' remove leading "[MS-XYZ]: ."

    ' Some kind of paragraph/eol marker that only occurs on two-line titles.
    ' NOTE(review): the search character was lost from this listing (an empty
    ' search string makes Replace a no-op). It is presumably Word's manual
    ' line break, Chr(11) - confirm against a two-line title.
    sTitle = Replace(sTitle, Chr(11), "")

    Debug.Print sDocName; " - "; sTitle

    '---------
    ' Locate the last row of the first table
    '---------
    ' Guard explicitly: an unhandled runtime error would raise a modal dialog,
    ' which blocks an unattended run in a hidden Word instance.
    If rng.Tables.Count = 0 Then
        Debug.Print "Skipped, no table found: "; sDocName
        Exit Sub
    End If
    Set revTable = rng.Tables(1)
    Set lastRow = revTable.Rows(revTable.Rows.Count)
    If lastRow.Cells.Count < 3 Then
        Debug.Print "Skipped, last table row has fewer than 3 cells: "; sDocName
        Exit Sub
    End If

    '---------
    ' Get the most recent publication date (last row in table)
    '---------
    sDateCell = lastRow.Cells(1).Range.Text
    sDate = Mid(sDateCell, 1, Len(sDateCell) - 2)   ' remove the two trailing end-of-cell characters
    ' Reorder "MM/DD/YYYY" into "YYYY-MM-DD"
    sRevDate = Mid(sDate, 7, 4) + "-" + Mid(sDate, 1, 2) + "-" + Mid(sDate, 4, 2)

    '---------
    ' Get the change type
    '---------
    sChangeCell = lastRow.Cells(3).Range.Text
    sChange = Mid(sChangeCell, 1, Len(sChangeCell) - 2)
    If sChange = "No change" Then sChange = "None"

    Debug.Print sRevDate; " - "; sChange

    '---------
    ' Write out this XML record
    '---------
    Call OutputDocInfo("C:\Output", dirDate, bMore, sDocName, sTitle, sRevDate, sChange)

End Sub

'---------------------------------------------------------------------
' OutputDocInfo
'
' Appends one <Release> record to folderPath\docName.xml.
'
'   folderPath - output folder (created if it does not exist yet)
'   dirDate    - release subdirectory name, used to build file locations
'   more       - False on the final release: also writes the closing
'                </Releases> and </Protocol> tags
'   docName    - document short name, e.g. MS-XYZ
'   docTitle   - document title
'   docDate    - publication date (YYYY-MM-DD)
'   docChange  - change type for this release
'
' The XML declaration and the opening <Protocol>/<Releases> tags are
' written only when the file does not exist yet.
'---------------------------------------------------------------------
Sub OutputDocInfo(folderPath As String, dirDate As String, more As Boolean, docName As String, docTitle As String, docDate As String, docChange As String)

    Const sExtension = ".xml"
    Const ForAppending = 8

    Dim fso As Object
    Dim textStream As Object
    Dim bFirstTime As Boolean
    Dim sAny As String
    Dim sFileName As String

    Dim quote As String
    quote = """"

    Dim sBasePath As String
    sBasePath = "\\base-path\release-dir\" + dirDate

    Dim sIntroXML As String
    Dim sWordLoc As String
    Dim sPDFLoc As String
    sIntroXML = "<?xml version=" + quote + "1.0" + quote + " encoding=" + quote + "utf-8" + quote + " ?>"
    sWordLoc = " <DocumentFile Type=" + quote + "Word" + quote + " Location="
    sPDFLoc = " <DocumentFile Type=" + quote + "PDF" + quote + " Location="

    sFileName = folderPath + "\" + docName + sExtension

    Set fso = CreateObject("Scripting.FileSystemObject")
    If Not fso.FolderExists(folderPath) Then fso.CreateFolder folderPath

    bFirstTime = Not fso.FileExists(sFileName)

    ' NOTE(review): OpenTextFile writes ANSI text by default, while the XML
    ' declaration above says utf-8. Pure-ASCII content is unaffected; titles
    ' with non-ASCII characters would need a UTF-8 capable writer.
    Set textStream = fso.OpenTextFile(sFileName, ForAppending, True)    ' append, create if missing

    If bFirstTime Then
        textStream.WriteLine sIntroXML
        sAny = "<Protocol Name=" + quote + XmlEscape(docName) + quote + " Title=" + quote + XmlEscape(docTitle) + quote + ">"
        textStream.WriteLine sAny
        textStream.WriteLine " <Releases>"
    End If

    sAny = " <Release PublishDate=" + quote + XmlEscape(docDate) + quote + " ProtocolRevision=" + quote + "None" + quote + " DocumentRevision=" + quote + XmlEscape(docChange) + quote + ">"
    textStream.WriteLine sAny

    sAny = sPDFLoc + quote + XmlEscape(sBasePath + "\Downloads\[" + docName + "].pdf") + quote + " />"
    textStream.WriteLine sAny

    sAny = sWordLoc + quote + XmlEscape(sBasePath + "\Documents\[" + docName + "].doc") + quote + " />"
    textStream.WriteLine sAny

    textStream.WriteLine " </Release>"

    If Not more Then
        textStream.WriteLine " </Releases>"
        textStream.WriteLine "</Protocol>"
    End If

    ' Release the file handle so the next document/run can append to it.
    textStream.Close

End Sub

'---------------------------------------------------------------------
' XmlEscape
'
' Returns s with the characters that are not allowed inside a
' double-quoted XML attribute value replaced by entity references.
' "&" is handled first so the other entities are not double-escaped.
'---------------------------------------------------------------------
Private Function XmlEscape(ByVal s As String) As String
    s = Replace(s, "&", "&amp;")
    s = Replace(s, "<", "&lt;")
    s = Replace(s, ">", "&gt;")
    s = Replace(s, """", "&quot;")
    XmlEscape = s
End Function