Thread: PDF to Excel
View Single Post
 
Old 12-15-2013, 01:35 AM
macropod's Avatar
macropod macropod is offline Windows 7 32bit Office 2010 32bit
Administrator
 
Join Date: Dec 2010
Location: Canberra, Australia
Posts: 21,963
macropod has a reputation beyond reputemacropod has a reputation beyond reputemacropod has a reputation beyond reputemacropod has a reputation beyond reputemacropod has a reputation beyond reputemacropod has a reputation beyond reputemacropod has a reputation beyond reputemacropod has a reputation beyond reputemacropod has a reputation beyond reputemacropod has a reputation beyond reputemacropod has a reputation beyond repute
Default

I'd suggest saving the PDF as a text file, then opening it Word and running the following Word macro:
Code:
Sub ParsePDFData()
Application.ScreenUpdating = False
With ActiveDocument.Range
  .Paragraphs.First.Range.Delete
  .Paragraphs.First.Range.Delete
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Forward = True
    .Wrap = wdFindContinue
    .Format = False
    .MatchWildcards = True
    .Text = "^13[!^13]@^13[!^13]@^13[!^13]@^13^12^13[!^13]@^13"
    .Replacement.Text = "^p"
    .Execute Replace:=wdReplaceAll
    .Text = "^13[!^13]@^13[!^13]@^13[!^13]@^13^12^13"
    .Replacement.Text = "^p"
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{1,}^13"
    .Execute Replace:=wdReplaceAll
    DoEvents
    .Text = "^13{2,}"
    .Replacement.Text = "^p"
    .Execute Replace:=wdReplaceAll
    DoEvents
    .Text = "(^13[0-9]{1,}>)([!$]@)($[!^13]{1,})"
    .Replacement.Text = "\1^t\2^t^t\3"
    .Execute Replace:=wdReplaceAll
    DoEvents
    .Text = "[0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4}"
    .Replacement.Text = "^t^&"
    .Execute Replace:=wdReplaceAll
    DoEvents
    .Text = "(EACH )(^t)(^t$)"
    .Replacement.Text = "\2\1\3"
    .Execute Replace:=wdReplaceAll
    DoEvents
    .Text = "([0-9]{3,}^t[!^t]@^t)([!0-9])"
    .Replacement.Text = "\1^t^t\2"
    .Execute Replace:=wdReplaceAll
    DoEvents
    .Text = "($[0-9.]{4,}) ($[!^13]{1,})"
    .Replacement.Text = "\1^t\2"
    .Execute Replace:=wdReplaceAll
    DoEvents
    .Text = "^t[ ]{1,}"
    .Replacement.Text = "^t"
    .Execute Replace:=wdReplaceAll
    DoEvents
    .Text = "[ ]{1,}^t"
    .Replacement.Text = "^t"
    .Execute Replace:=wdReplaceAll
  End With
  .Copy
End With
Call Export
Application.ScreenUpdating = True
End Sub
 
Sub Export()
Dim xlApp As Object, xlWkBk As Object
Set xlApp = CreateObject("Excel.Application")
xlApp.Visible = True
xlApp.ScreenUpdating = False
Set xlWkBk = xlApp.Workbooks.Add
With xlWkBk.Sheets(1)
  .Range("A1").PasteSpecial Paste:=-4163 'xlPasteValues
  .Columns.AutoFit
  .Columns("A:A").ColumnWidth = 8
  .Range("A1").Select
End With
xlApp.ScreenUpdating = True
Set xlWkBk = Nothing: Set xlApp = Nothing
End Sub
The result will be an Excel worksheet containing the data, all nicely aligned. As coded, the only substantive difference is that the red values and cross-out values are shifted one column to the right, so that all the current prices are in the same column.

Note: With 800 pages of data to process, the code will take some time to complete.
__________________
Cheers,
Paul Edstein
[Fmr MS MVP - Word]

Last edited by macropod; 12-15-2013 at 04:15 AM. Reason: Enhanced XL output
Reply With Quote