将包含Transcript文本的Word文档解析为Excel列的最快方法

rn0zuynd  于 2023-08-08  发布在  其他
关注(0)|答案(1)|浏览(87)

我有多个Word文档文件,每个文件都包含如下文本(未显示段落标记):

Some Title1              ' <--- Some title ending with paragraph mark
(Apr 3, 2023 - 9:00am)  ' <--- Date - time ending with paragraph mark
(Interviewee: Harry) ' <--- Interviewee name to pick to only add interviewee lines (name and text) to outrow array.
                        ' <--- blank line ending with paragraph mark
(00:00:00 - 00:00:02)   ' <--- timestamp ending with paragraph mark

Harry: Okay, thank you. ' <--- Speaker: Text ending with paragraph mark

(00:00:02 - 00:00:06)
Tom: Hi, Harry, hello. Are you okay?

(00:00:06 - 00:00:09)
Harry: Yeah, I'm good, thank you. How are you doing? Happy Monday to you.

(00:00:09 - 00:00:12)
Tom: It's a nice Monday today, so it's quite bright for a change.

字符串
由于有许多文档文件,我想复制整个内容(所有段落)从每个文档文件到excel工作表Sheet2,追加每个内容到最后一个非空白行。完成后,我想使用Excel中的TextToColumns功能将文本拆分为单独的列,如图所示:
| 日期时间|时间戳记|发言人|文字档| Text |
| ------------|------------|------------|------------| ------------ |
| (2023年4月3日-上午9:00)|(00:00:00 - 00:00:02)|哈利|好的谢谢||
| | | (00你好吗?|祝你周一快乐。| How are you doing? Happy Monday to you.|
| (2023年4月5日-晚上19:00)|(00:00:00 - 00:00:04)|吉尔|我很好。||
| | | (00:00:06 - 00:00:12) | Jill | I'm busy. |
...
目前我只能循环和复制粘贴的文档内容表。一旦合并到工作表中,我想将此内容转换到如上所示的表格中。另外,如果有一种方法可以将所有的文档内容收集到一个数组或ado记录集中,然后一次性将数组/记录集内容直接传输到工作表,这将加快代码速度并保存一些时间。

Option Explicit

Sub ParseTranscriptToExcelSheet()

    Application.ScreenUpdating = False
    Application.DisplayAlerts = False
    
    Dim wdApp As Object ' Word application
    Dim wdDoc As Object ' Word document
    Dim tbl As Object   ' Table
    Dim para As Object  ' Paragraph
    Dim row As Integer  ' Row index for the table
    Dim i As Long
    Dim oFileDialog As FileDialog
    Dim vSelectedItem As Variant
    
    ' declare worksheets
    Dim ws1 As Worksheet
    Dim ws2 As Worksheet
    
    Set ws1 = ThisWorkbook.Sheets(1) ' contains button to run code
    Set ws2 = ThisWorkbook.Sheets(2)
    
    ' Add a header row to the worksheet 2
    ws2.Range("A1:E1").Value = Array("Title", "DateTime", "Timestamp", "Speaker", "Text")
    
    ' Initialize the row index for the table
    row = ws2.Cells(ws2.Rows.Count, 1).End(xlUp).row + 1
    
    ' Open the Word document containing the transcript
    On Error Resume Next
    Set wdApp = GetObject(, "Word.Application")
    On Error GoTo 0
        
    If wdApp Is Nothing Then
        Set wdApp = CreateObject("Word.Application")
    End If
    
    With wdApp
        .Visible = False
    End With
            
'    ReDim sContent(1 To 1)
    Set oFileDialog = Application.FileDialog(msoFileDialogFilePicker)
    With oFileDialog
        .Title = "Select Word Files"
        .AllowMultiSelect = True
        .Filters.Add "Word files", "*.doc*", 1
        If .Show = -1 Then
            ws2.Activate
            For Each vSelectedItem In .SelectedItems
                Set wdDoc = wdApp.Documents.Open(vSelectedItem)
                With wdDoc
'                    sContent(UBound(sContent)) = .Content.formattedtext.text
                    .Content.Copy
                    ws2.Cells(row, 1).PasteSpecial (xlPasteValues)
                    Application.CutCopyMode = False
                    DoEvents
                    .Close savechanges:=False
                    row = ws2.Cells(ws2.Rows.Count, 1).End(xlUp).row + 1
                End With
'                ReDim Preserve sContent(1 To UBound(sContent) + 1) As String
            Next vSelectedItem
'            ReDim Preserve sContent(1 To UBound(sContent) - 1) As String
                
        Else
            MsgBox "No files selected"
        End If
    
    End With
    
    
    wdApp.Quit
    Set wdDoc = Nothing
    Set wdApp = Nothing
    Set ws1 = Nothing
    Set ws2 = Nothing
    
    Application.DisplayAlerts = True
    Application.ScreenUpdating = True
End Sub

wn9m85ua

wn9m85ua1#

试试这个:

Option Explicit

Sub ParseTranscriptToExcelSheet()
    Dim wdApp As Object ' Word application
    Dim wdDoc As Object ' Word document
    Dim allFiles As Collection, f, txt As String, arr, el, ub As Long
    Dim ws2 As Worksheet, nextRow As Long, x As Long, arr2, ln
    
    Set allFiles = SelectedFiles()
    If allFiles.Count = 0 Then Exit Sub
    
    Set ws2 = ThisWorkbook.Sheets(2)
    ws2.Range("A1:E1").Value = Array("Title", "DateTime", "Timestamp", "Speaker", "Text")
    nextRow = ws2.Cells(ws2.Rows.Count, 1).End(xlUp).row + 1
    
    Set wdApp = GetWordApp()
    
    For Each f In allFiles                     'loop over selected files
        Set wdDoc = wdApp.Documents.Open(f)    'open files
        txt = wdDoc.Range.Text                 'read content
        wdDoc.Close False
        arr = Split(txt, vbCr)                 'get array of lines/paras
        ub = UBound(arr)
        If ub > 0 Then
            ws2.Cells(nextRow, "A").Value = arr(0)  'fill the "header" info
            ws2.Cells(nextRow, "B").Value = arr(1)
            For x = 2 To UBound(arr)                'process rest of lines
                ln = Trim(arr(x))
                If ln Like "(*)" Then                       'timestamp?
                    ws2.Cells(nextRow, "C").Value = ln 
                ElseIf ln Like "*:*" Then                   'speaker text?           
                    arr2 = Split(ln, ":", 2)
                    ws2.Cells(nextRow, "D").Value = arr2(0) 'speaker
                    ws2.Cells(nextRow, "E").Value = arr2(1) 'content
                    nextRow = nextRow + 1
                End If
            Next x
        End If
        nextRow = nextRow + 1
    Next f
End Sub

'return a Collection of user-selected Word files
Function SelectedFiles() As Collection
    Dim f
    Set SelectedFiles = New Collection
    With Application.FileDialog(msoFileDialogFilePicker)
        .Title = "Select one or more Word Files"
        .AllowMultiSelect = True
        .Filters.Add "Word files", "*.doc*", 1
        If .Show = -1 Then
            For Each f In .SelectedItems
                SelectedFiles.Add f
            Next f
        End If
    End With
End Function

'Get a running Word instance, or start a new instance
Function GetWordApp() As Object
    On Error Resume Next
    Set GetWordApp = GetObject(, "Word.Application")
    On Error GoTo 0
    If GetWordApp Is Nothing Then
        Set GetWordApp = CreateObject("Word.Application") 'assuming this works ok...
    End If
    GetWordApp.Visible = True
End Function

字符串
编辑:这里有一个版本,它填充一个数组,并在最后写入Excel工作表

Sub ParseTranscriptToExcelSheet_Array()
    Dim wdApp As Object ' Word application
    Dim wdDoc As Object ' Word document
    Dim allFiles As Collection, f, txt As String, arr, el, ub As Long
    Dim ws2 As Worksheet, x As Long, arr2, ln, arrOut(), outRow As Long
    
    Set allFiles = SelectedFiles()
    If allFiles.Count = 0 Then Exit Sub
    
    Set ws2 = ThisWorkbook.Sheets(2)
    ws2.Cells.ClearContents 'for testing....
    
    ws2.Range("A1:E1").Value = Array("Title", "DateTime", "Timestamp", "Speaker", "Text")
   
    ReDim arrOut(1 To 20000, 1 To 5) 'array for output; guessing at max size....
    outRow = 1
    
    Set wdApp = GetWordApp()
    
    For Each f In allFiles                     'loop over selected files
        Set wdDoc = wdApp.Documents.Open(f)    'open files
        txt = wdDoc.Range.Text                 'read content
        wdDoc.Close False
        arr = Split(txt, vbCr)                 'get array of lines/paras
        ub = UBound(arr)
        If ub > 0 Then
            arrOut(outRow, 1) = arr(0)  'fill the "header" info
            arrOut(outRow, 2) = arr(1)
            For x = 2 To UBound(arr)            'process rest of lines
                ln = Trim(arr(x))
                If ln Like "(*)" Then           'timestamp?
                    arrOut(outRow, 3) = ln
                ElseIf ln Like "*:*" Then       'speaker text?
                    arr2 = Split(ln, ":", 2)
                    arrOut(outRow, 4) = arr2(0) 'speaker
                    arrOut(outRow, 5) = arr2(1) 'content
                    outRow = outRow + 1
                End If
            Next x
        End If
        outRow = outRow + 1
    Next f
    
    'any content to write?
    If outRow > 1 Then ws2.Cells(ws2.Rows.Count, 1).End(xlUp). _
           Offset(1).Resize(outRow, 5).Value = arrOut
    
End Sub

相关问题