[ChatGPT] Writing code with ChatGPT (2) – extract the text between specified marker strings in a Word (.docx) document and save each extracted piece to its own cell in an Excel sheet.

# This is a sample Python script.

# Press Shift + F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
”’
import pandas as pd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams, LTTextBoxHorizontal

# The path of the PDF file that needs to be parsed
pdf_file_path = ‘D:\SVN\kernel_project\Product Management\Product Technical Specifications\Autosar\CP\R4.2.2\AUTOSAR_SWS_OS.pdf’

def main():
# Open the PDF file and parse the content
with open(pdf_file_path, ‘rb’) as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFDevice(rsrcmgr)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# list for saving PDF content
content_list = []

# Traverse the PDF pages, looking for specific fields
for page in PDFPage.create_pages(doc):
interpreter. process_page(page)
layout = device. get_result()
for lt_obj in layout:
if isinstance(lt_obj, LTTextBoxHorizontal):
# Here you need to set the fields to be searched and the corresponding conditions according to the actual situation
if ‘[SWS_Os’ in lt_obj.get_text() and ‘? ( )’ in lt_obj.get_text():
# Split the matched fields by spaces, and remove redundant spaces and newlines
fields = [field.strip() for field in lt_obj.get_text().split(‘ ‘) if field.strip()]
# add the field to the content list
content_list.append(fields)

# Save the content to an Excel table
df = pd. DataFrame(content_list)
df.to_excel(‘output.xlsx’, index=False, header=False)
”’

”’
from docx import Document
import xlwt

def extract_fields_from_word(docx_path, fields, output_path):
doc = Document(docx_path)

# Create Excel workbooks and worksheets
workbook = xlwt. Workbook()
worksheet = workbook.add_sheet(‘Extracted Fields’)

# Set Excel table header
worksheet.write(0, 0, ‘Word File’)
for row, field in enumerate(fields):
worksheet. write(row + 1, 0, field)

# Extract field content and save to Excel sheet
for col, field in enumerate(fields):
worksheet. write(0, col + 1, field)

for paragraph in doc.paragraphs:
if field in paragraph.text:
extracted_text = paragraph.text.replace(field, “”).strip()
worksheet.write(fields.index(field) + 1, col + 1, extracted_text)

# save the excel sheet
workbook. save(output_path)

# use example
def main():
docx_path = “D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.docx” # Replace with the actual Word document path
fields = [‘[SWS_Os’, ‘? ( )’] # replace with a list of fields to extract
output_path = “D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.xls” # Replace with the output Excel table path

extract_fields_from_word(docx_path, fields, output_path)
”’

import mysql.connector
import xlwt
from docx import Document

”’
def extract_text_between_fields(docx_path, field1, field2, output_path):
doc = Document(docx_path)
extracted_text = “”
is_between_fields = False

for paragraph in doc.paragraphs:
#if field1 in paragraph.text and field2 in paragraph.text:
if field1 in paragraph.text:
extracted_text + = paragraph.text + “\\

is_between_fields = True
####
elif field2 in paragraph.text:
extracted_text + = paragraph.text + “\\

is_between_fields = False
elif is_between_fields:
extracted_text + = paragraph.text + “\\

mydb = mysql.connector.connect(
host=”localhost”, # database host address
user=”yourusername”, # database username
passwd=”yourpassword” # database password
)

print(mydb)
####
# Create Excel workbooks and worksheets
try:
workbook = xlwt. Workbook()
worksheet = workbook.add_sheet(‘Extracted Text’)

# Save the extracted text to a cell in an Excel sheet
worksheet.write(0, 0, ‘Extracted Text’)
worksheet. write(1, 0, extracted_text)

# save the excel sheet
workbook. save(output_path)
print(“Excel table has been saved to”,output_path)

except Exception as e:
print(“An error occurred while creating the Excel table”, str(e))
”’

def extract_text_between_fields(docx_path, field1, field2, output_path):
    """Export every paragraph containing ``field1`` from a Word file to an .xls sheet.

    Despite the name, only paragraphs whose text contains ``field1`` are
    collected; ``field2`` is accepted so existing callers keep working but is
    not consulted.

    Args:
        docx_path: Path of the Word document, passed to ``Document``.
        field1: Substring a paragraph must contain to be exported.
        field2: Unused.
        output_path: Destination path passed to ``workbook.save``.

    Returns:
        None. Matching paragraphs are written to column 0 of the
        'Extracted Text' sheet, one per row, in document order.

    Errors raised while opening the document propagate to the caller; errors
    raised while building or saving the workbook are reported with ``print``
    and otherwise swallowed.
    """
    doc = Document(docx_path)

    # Read each paragraph's text once and keep only those mentioning field1.
    paragraph_texts = (paragraph.text for paragraph in doc.paragraphs)
    extracted_text = [text for text in paragraph_texts if field1 in text]

    # Create the Excel workbook and worksheet.
    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')

        # One matching paragraph per row, all in the first column.
        for row, text in enumerate(extracted_text):
            worksheet.write(row, 0, text)

        # Save the Excel sheet.
        workbook.save(output_path)
        print("Excel table has been saved to", output_path)

    except Exception as e:
        # Best-effort export: report the failure instead of crashing the script.
        print("An error occurred while creating the Excel table", str(e))

# use example
def main():
    """Run the extraction with the example AUTOSAR OS specification paths."""
    # Raw strings: these Windows paths contain backslash sequences such as
    # "\W", "\I" and "\A" that are not valid escapes in a normal literal.
    docx_path = r"D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.docx"  # Replace with the actual Word document path
    field1 = "[SWS_Os_"  # Replace with the ID of field 1
    field2 = "?"  # Replace with the ID of field 2
    output_path = r"D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS_1.1.xls"  # Replace with the output Excel table path

    extract_text_between_fields(docx_path, field1, field2, output_path)
def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output."""
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl + F8 to toggle the breakpoint.

# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # Entry point when the file is run as a script: print the template
    # greeting, then run main().
    print_hi('PyCharm')
    main()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/

# This is a sample Python script.

# Press Shift + F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
'''
import pandas as pd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams, LTTextBoxHorizontal

# The path of the PDF file that needs to be parsed
pdf_file_path = 'D:\SVN\kernel_project\Product Management\Product Technical Specifications\Autosar\CP\R4.2.2\AUTOSAR_SWS_OS.pdf'

def main():
    # Open the PDF file and parse the content
    with open(pdf_file_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # list for saving PDF content
        content_list = []

        # Traverse the PDF pages, looking for specific fields
        for page in PDFPage.create_pages(doc):
            interpreter. process_page(page)
            layout = device. get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBoxHorizontal):
                    # Here you need to set the fields to be searched and the corresponding conditions according to the actual situation
                    if '[SWS_Os' in lt_obj.get_text() and '? ( )' in lt_obj.get_text():
                        # Split the matched fields by spaces, and remove redundant spaces and newlines
                        fields = [field.strip() for field in lt_obj.get_text().split(' ') if field.strip()]
                        # add the field to the content list
                        content_list.append(fields)

    # Save the content to an Excel table
    df = pd. DataFrame(content_list)
    df.to_excel('output.xlsx', index=False, header=False)
'''

'''
from docx import Document
import xlwt


def extract_fields_from_word(docx_path, fields, output_path):
    doc = Document(docx_path)

    # Create Excel workbooks and worksheets
    workbook = xlwt. Workbook()
    worksheet = workbook.add_sheet('Extracted Fields')

    # Set Excel table header
    worksheet.write(0, 0, 'Word File')
    for row, field in enumerate(fields):
        worksheet. write(row + 1, 0, field)

    # Extract field content and save to Excel sheet
    for col, field in enumerate(fields):
        worksheet. write(0, col + 1, field)

        for paragraph in doc.paragraphs:
            if field in paragraph.text:
                extracted_text = paragraph.text.replace(field, "").strip()
                worksheet.write(fields.index(field) + 1, col + 1, extracted_text)

    # save the excel sheet
    workbook. save(output_path)


# use example
def main():
    docx_path = "D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.docx" # Replace with the actual Word document path
    fields = ['[SWS_Os', '? ( )'] # replace with a list of fields to extract
    output_path = "D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.xls" # Replace with the output Excel table path

    extract_fields_from_word(docx_path, fields, output_path)
'''

import mysql.connector
import xlwt
from docx import Document

'''
def extract_text_between_fields(docx_path, field1, field2, output_path):
    doc = Document(docx_path)
    extracted_text = ""
    is_between_fields = False

    for paragraph in doc.paragraphs:
        #if field1 in paragraph.text and field2 in paragraph.text:
        if field1 in paragraph.text:
            extracted_text + = paragraph.text + "\\
"
            is_between_fields = True
        ####
        elif field2 in paragraph.text:
            extracted_text + = paragraph.text + "\\
"
            is_between_fields = False
        elif is_between_fields:
            extracted_text + = paragraph.text + "\\
"


    mydb = mysql.connector.connect(
        host="localhost", # database host address
        user="yourusername", # database username
        passwd="yourpassword" # database password
    )

    print(mydb)
    ####
    # Create Excel workbooks and worksheets
    try:
        workbook = xlwt. Workbook()
        worksheet = workbook.add_sheet('Extracted Text')

        # Save the extracted text to a cell in an Excel sheet
        worksheet.write(0, 0, 'Extracted Text')
        worksheet. write(1, 0, extracted_text)

        # save the excel sheet
        workbook. save(output_path)
        print("Excel table has been saved to",output_path)

    except Exception as e:
        print("An error occurred while creating the Excel table", str(e))
'''

def extract_text_between_fields(docx_path, field1, field2, output_path):
    """Export every paragraph containing ``field1`` from a Word file to an .xls sheet.

    Despite the name, only paragraphs whose text contains ``field1`` are
    collected; ``field2`` is accepted so existing callers keep working but is
    not consulted.

    Args:
        docx_path: Path of the Word document, passed to ``Document``.
        field1: Substring a paragraph must contain to be exported.
        field2: Unused.
        output_path: Destination path passed to ``workbook.save``.

    Returns:
        None. Matching paragraphs are written to column 0 of the
        'Extracted Text' sheet, one per row, in document order.

    Errors raised while opening the document propagate to the caller; errors
    raised while building or saving the workbook are reported with ``print``
    and otherwise swallowed.
    """
    doc = Document(docx_path)

    # Read each paragraph's text once and keep only those mentioning field1.
    paragraph_texts = (paragraph.text for paragraph in doc.paragraphs)
    extracted_text = [text for text in paragraph_texts if field1 in text]

    # Create the Excel workbook and worksheet.
    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')

        # One matching paragraph per row, all in the first column.
        for row, text in enumerate(extracted_text):
            worksheet.write(row, 0, text)

        # Save the Excel sheet.
        workbook.save(output_path)
        print("Excel table has been saved to", output_path)

    except Exception as e:
        # Best-effort export: report the failure instead of crashing the script.
        print("An error occurred while creating the Excel table", str(e))


# use example
def main():
    """Run the extraction with the example AUTOSAR OS specification paths."""
    # Raw strings: these Windows paths contain backslash sequences such as
    # "\W", "\I" and "\A" that are not valid escapes in a normal literal.
    docx_path = r"D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.docx"  # Replace with the actual Word document path
    field1 = "[SWS_Os_"  # Replace with the ID of field 1
    field2 = "?"  # Replace with the ID of field 2
    output_path = r"D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS_1.1.xls"  # Replace with the output Excel table path

    extract_text_between_fields(docx_path, field1, field2, output_path)
def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output."""
    # Build the message first so a breakpoint here shows the final text.
    greeting = "Hi, " + format(name)
    print(greeting)


# Press the green button in the gutter to run the script.
if __name__ == "__main__":
    # Entry point when the file is run as a script: print the template
    # greeting, then run main().
    print_hi("PyCharm")
    main()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/