# This is a sample Python script.
# Press Shift + F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
'''
import pandas as pd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams, LTTextBoxHorizontal
# The path of the PDF file that needs to be parsed
pdf_file_path = 'D:\SVN\kernel_project\Product Management\Product Technical Specifications\Autosar\CP\R4.2.2\AUTOSAR_SWS_OS.pdf'
def main():
    # Open the PDF file and parse the content
    with open(pdf_file_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # list for saving PDF content
        content_list = []
        # Traverse the PDF pages, looking for specific fields
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBoxHorizontal):
                    # Here you need to set the fields to be searched and the corresponding conditions according to the actual situation
                    if '[SWS_Os' in lt_obj.get_text() and '? ( )' in lt_obj.get_text():
                        # Split the matched fields by spaces, and remove redundant spaces and newlines
                        fields = [field.strip() for field in lt_obj.get_text().split(' ') if field.strip()]
                        # add the field to the content list
                        content_list.append(fields)
    # Save the content to an Excel table
    df = pd.DataFrame(content_list)
    df.to_excel('output.xlsx', index=False, header=False)
'''
'''
from docx import Document
import xlwt
def extract_fields_from_word(docx_path, fields, output_path):
    doc = Document(docx_path)
    # Create Excel workbooks and worksheets
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('Extracted Fields')
    # Set Excel table header
    worksheet.write(0, 0, 'Word File')
    for row, field in enumerate(fields):
        worksheet.write(row + 1, 0, field)
    # Extract field content and save to Excel sheet
    for col, field in enumerate(fields):
        worksheet.write(0, col + 1, field)
        for paragraph in doc.paragraphs:
            if field in paragraph.text:
                extracted_text = paragraph.text.replace(field, "").strip()
                worksheet.write(fields.index(field) + 1, col + 1, extracted_text)
    # save the excel sheet
    workbook.save(output_path)
# use example
def main():
    docx_path = "D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.docx"  # Replace with the actual Word document path
    fields = ['[SWS_Os', '? ( )']  # replace with a list of fields to extract
    output_path = "D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.xls"  # Replace with the output Excel table path
    extract_fields_from_word(docx_path, fields, output_path)
'''
import mysql.connector
import xlwt
from docx import Document
'''
def extract_text_between_fields(docx_path, field1, field2, output_path):
    doc = Document(docx_path)
    extracted_text = ""
    is_between_fields = False
    for paragraph in doc.paragraphs:
        #if field1 in paragraph.text and field2 in paragraph.text:
        if field1 in paragraph.text:
            extracted_text += paragraph.text + "\\n"
            is_between_fields = True
        ####
        elif field2 in paragraph.text:
            extracted_text += paragraph.text + "\\n"
            is_between_fields = False
        elif is_between_fields:
            extracted_text += paragraph.text + "\\n"
    mydb = mysql.connector.connect(
        host="localhost",  # database host address
        user="yourusername",  # database username
        passwd="yourpassword"  # database password
    )
    print(mydb)
    ####
    # Create Excel workbooks and worksheets
    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')
        # Save the extracted text to a cell in an Excel sheet
        worksheet.write(0, 0, 'Extracted Text')
        worksheet.write(1, 0, extracted_text)
        # save the excel sheet
        workbook.save(output_path)
        print("Excel table has been saved to", output_path)
    except Exception as e:
        print("An error occurred while creating the Excel table", str(e))
'''
def extract_text_between_fields(docx_path, field1, field2, output_path):
    """Copy every paragraph that contains ``field1`` into an Excel sheet.

    The Word document at ``docx_path`` is opened and each paragraph whose
    text contains ``field1`` is written, in document order, to column 0 of a
    sheet named 'Extracted Text' (one paragraph per row, starting at row 0).
    The workbook is then saved to ``output_path``.

    Args:
        docx_path: Path of the Word document to read.
        field1: Substring a paragraph must contain to be extracted.
        field2: Accepted so existing callers keep working, but currently
            unused: the end-marker handling is disabled in this version.
        output_path: Path of the Excel workbook to write.

    Returns:
        None. Errors raised while building or saving the workbook are
        reported on stdout instead of being propagated; errors raised while
        opening the Word document are not caught here.
    """
    doc = Document(docx_path)

    # Only paragraphs mentioning field1 are kept; nothing "between" markers
    # is collected while the field2 logic stays disabled.
    extracted_text = [
        paragraph.text for paragraph in doc.paragraphs
        if field1 in paragraph.text
    ]

    # Create the Excel workbook and worksheet, one extracted paragraph per row.
    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')
        for i, text in enumerate(extracted_text):
            worksheet.write(i, 0, text)
        workbook.save(output_path)
        print("Excel table has been saved to", output_path)
    except Exception as e:
        # Best-effort export: report the failure rather than crash the script.
        print("An error occurred while creating the Excel table", str(e))
# use example
def main():
    """Run the extraction with the example document, marker and output path."""
    # Raw strings keep the Windows backslashes literal; sequences such as
    # "\W", "\I" and "\A" are not valid escapes and trigger warnings in
    # ordinary string literals. The resulting path values are unchanged.
    docx_path = r"D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.docx"  # Replace with the actual Word document path
    field1 = "[SWS_Os_"  # Replace with the ID of field 1
    field2 = "?"  # Replace with the ID of field 2
    output_path = r"D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS_1.1.xls"  # Replace with the output Excel table path
    extract_text_between_fields(docx_path, field1, field2, output_path)
def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output.

    Args:
        name: Value interpolated into the greeting.
    """
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl + F8 to toggle the breakpoint.
# Press the green button in the gutter to run the script.
# Script entry point: greet first, then run the example extraction.
if __name__ == '__main__':
    print_hi('PyCharm')
    main()
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
# This is a sample Python script.
# Press Shift + F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
'''
import pandas as pd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams, LTTextBoxHorizontal
# The path of the PDF file that needs to be parsed
pdf_file_path = 'D:\SVN\kernel_project\Product Management\Product Technical Specifications\Autosar\CP\R4.2.2\AUTOSAR_SWS_OS.pdf'
def main():
    # Open the PDF file and parse the content
    with open(pdf_file_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # list for saving PDF content
        content_list = []
        # Traverse the PDF pages, looking for specific fields
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBoxHorizontal):
                    # Here you need to set the fields to be searched and the corresponding conditions according to the actual situation
                    if '[SWS_Os' in lt_obj.get_text() and '? ( )' in lt_obj.get_text():
                        # Split the matched fields by spaces, and remove redundant spaces and newlines
                        fields = [field.strip() for field in lt_obj.get_text().split(' ') if field.strip()]
                        # add the field to the content list
                        content_list.append(fields)
    # Save the content to an Excel table
    df = pd.DataFrame(content_list)
    df.to_excel('output.xlsx', index=False, header=False)
'''
'''
from docx import Document
import xlwt
def extract_fields_from_word(docx_path, fields, output_path):
    doc = Document(docx_path)
    # Create Excel workbooks and worksheets
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('Extracted Fields')
    # Set Excel table header
    worksheet.write(0, 0, 'Word File')
    for row, field in enumerate(fields):
        worksheet.write(row + 1, 0, field)
    # Extract field content and save to Excel sheet
    for col, field in enumerate(fields):
        worksheet.write(0, col + 1, field)
        for paragraph in doc.paragraphs:
            if field in paragraph.text:
                extracted_text = paragraph.text.replace(field, "").strip()
                worksheet.write(fields.index(field) + 1, col + 1, extracted_text)
    # save the excel sheet
    workbook.save(output_path)
# use example
def main():
    docx_path = "D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.docx"  # Replace with the actual Word document path
    fields = ['[SWS_Os', '? ( )']  # replace with a list of fields to extract
    output_path = "D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.xls"  # Replace with the output Excel table path
    extract_fields_from_word(docx_path, fields, output_path)
'''
import mysql.connector
import xlwt
from docx import Document
'''
def extract_text_between_fields(docx_path, field1, field2, output_path):
    doc = Document(docx_path)
    extracted_text = ""
    is_between_fields = False
    for paragraph in doc.paragraphs:
        #if field1 in paragraph.text and field2 in paragraph.text:
        if field1 in paragraph.text:
            extracted_text += paragraph.text + "\\n"
            is_between_fields = True
        ####
        elif field2 in paragraph.text:
            extracted_text += paragraph.text + "\\n"
            is_between_fields = False
        elif is_between_fields:
            extracted_text += paragraph.text + "\\n"
    mydb = mysql.connector.connect(
        host="localhost",  # database host address
        user="yourusername",  # database username
        passwd="yourpassword"  # database password
    )
    print(mydb)
    ####
    # Create Excel workbooks and worksheets
    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')
        # Save the extracted text to a cell in an Excel sheet
        worksheet.write(0, 0, 'Extracted Text')
        worksheet.write(1, 0, extracted_text)
        # save the excel sheet
        workbook.save(output_path)
        print("Excel table has been saved to", output_path)
    except Exception as e:
        print("An error occurred while creating the Excel table", str(e))
'''


def extract_text_between_fields(docx_path, field1, field2, output_path):
    """Copy every paragraph that contains ``field1`` into an Excel sheet.

    Each paragraph of the Word document at ``docx_path`` whose text contains
    ``field1`` is written, in document order, to column 0 of a sheet named
    'Extracted Text'; the workbook is then saved to ``output_path``.

    Args:
        docx_path: Path of the Word document to read.
        field1: Substring a paragraph must contain to be extracted.
        field2: Accepted so existing callers keep working, but currently
            unused: the end-marker handling is disabled in this version.
        output_path: Path of the Excel workbook to write.

    Returns:
        None. Errors raised while building or saving the workbook are
        reported on stdout instead of being propagated.
    """
    doc = Document(docx_path)
    extracted_text = [
        paragraph.text for paragraph in doc.paragraphs
        if field1 in paragraph.text
    ]

    # Create the Excel workbook and worksheet, one extracted paragraph per row.
    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')
        for i, text in enumerate(extracted_text):
            worksheet.write(i, 0, text)
        workbook.save(output_path)
        print("Excel table has been saved to", output_path)
    except Exception as e:
        # Best-effort export: report the failure rather than crash the script.
        print("An error occurred while creating the Excel table", str(e))


# use example
def main():
    """Run the extraction with the example document, marker and output path."""
    # Raw strings keep the Windows backslashes literal; the path values are
    # the same as with the original non-raw literals.
    docx_path = r"D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS.docx"  # Replace with the actual Word document path
    field1 = "[SWS_Os_"  # Replace with the ID of field 1
    field2 = "?"  # Replace with the ID of field 2
    output_path = r"D:\WorkspaceTZX\Internship Work\AUTOSAR_SWS_OS_1.1.xls"  # Replace with the output Excel table path
    extract_text_between_fields(docx_path, field1, field2, output_path)


def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output."""
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl + F8 to toggle the breakpoint.


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')
    main()
# See PyCharm help at https://www.jetbrains.com/help/pycharm/