View Single Post
 
Old 12-28-2021, 02:54 PM
donlincolnmsof donlincolnmsof is offline Windows 7 64bit Office 2003
Advanced Beginner
 
Join Date: Oct 2011
Posts: 36
donlincolnmsof is on a distinguished road
Default Extract data from word html document.

Hello

I’m looking to get some help on the following macro. It has 3 searches to perform and then extract data if HARD CODED date of dec-27-21 with a one space at the end is found

So e.g Its dec-27-21#

# meaning as an example that there is an white space at the end of the date.

Its searching for <\!doctype html\> beginning and end of document tag and then within that tag it searching for title and then extracting that title line and then if it find date dec-27-21# then it the beginning of RECORD and the end of record is <td width=""130"" align=""right"" style=""white-space:nowrap""> and its extracting that out of the document where it finds it.

I’m not familiar with putting the current date in a variable and then calling up that variable in the macro if that can be done then it will be great, but the way it is I have the date hard coded and I change it from time to time by editing the macro, and that bit extra to remember to change the date as to every time when I run the macro.

Input and output data files can be found on following link

Update your browser to use Google Drive, Docs, Sheets, Sites, Slides, and Forms - Google Drive Help

Any help fixing this macro will be greatly appreciated.

Thanks.

Sub A_News_Fast()
'
'
Selection.WholeStory
Selection.Delete Unit:=wdCharacter, Count:=1
Selection.Paste
Selection.HomeKey Unit:=wdStory


Dim aRng As Range, aDoc As Document, aDoc2 As Document
Dim aRngInner As Range, sText As String, sTitle As String

Set aDoc = ActiveDocument
Set aRng = aDoc.Range
With aRng.Find
.ClearFormatting
.Text = "\<\!doctype html\>*\</html\>"
.Forward = True
.Wrap = wdFindStop
.MatchCase = False
.MatchWildcards = True
Do While .Execute = True


'Use this range to find the first title instance inside it
Set aRngInner = aRng.Duplicate
With aRngInner.Find
.Text = "\<title\>*\</title\>"
.Forward = True
.Wrap = wdFindStop
.MatchCase = False
.MatchWildcards = True
If .Execute = True Then
sTitle = aRngInner.Text
Else
sTitle = "Title Not Found"
End If
End With


'Now use the same range to find all the phone numbers
Set aRngInner = aRng.Duplicate
With aRngInner.Find
.Text = "\dec-27-21 \*\<td width=""130"" align=""right"" style=""white-space:nowrap"">\>"
.Forward = True
.Wrap = wdFindStop
.MatchCase = False
.MatchWildcards = True
Do While .Execute = True
Debug.Print aRngInner.Text, sTitle
sText = sText & vbCr & aRngInner.Text & vbTab & sTitle
aRngInner.Collapse Direction:=wdCollapseEnd
aRngInner.End = aRng.End
Loop
End With
aRng.Collapse Direction:=wdCollapseEnd
If Len(sText) > 0 Then sText = sText & vbCr
Loop
End With



If Len(sText) > 0 Then
Set aDoc2 = Documents.Add(Visible:=True)
aDoc2.Range.Text = sText
Else
MsgBox "No hits"
End If


End Sub
Reply With Quote