python – processing excel spreadsheets – openpyxl module

Foreword

openpyxl module: can read and modify Excel spreadsheet files

  • Software name and software download address
    LibreOffice Calc
    Apache OpenOffice – Official Site – The Free and Open Productivity Suite

    OpenOffice Calc
    Home | LibreOffice – Free Office Suite – Based on OpenOffice – Compatible with Microsoft

An Excel spreadsheet document is called a workbook – – – has the extension .xlsx
Each workbook can contain multiple sheets (also known as worksheets)
The sheet that the user is currently viewing (or was last viewing before closing Excel), known as the active sheet

1. Install the openpyxl module

pip install openpyxl

openpyxl – A Python library to read/write Excel 2010 xlsx/xlsm files

2. Read Excel documents

1. Open Excel document with openpyxl module openpyxl.load_workbook()function

import openpyxl

# open Excel file
wb = openpyxl.load_workbook('example.xlsx')
type(wb) # <class 'openpyxl.workbook.workbook.Workbook'>

2. Get the name of the worksheet from the workbook sheetnames

wb.sheetnames # ['Sheet1', 'Sheet2', 'Sheet3']
# select worksheet
sheet = wb['Sheet3']
print(sheet) # <Worksheet "Sheet3">
type(sheet) # <class 'openpyxl.worksheet.worksheet.Worksheet'>
sheet.title # 'Sheet3'
anotherSheet = wb.active # active sheet
anotherSheet # <Worksheet "Sheet1">

3. Get cells from the table

Once you have the Worksheet object, you can access the Cell object by name

row row, column column and coordinate coordinate attribute, which provides the location information of the cell

import openpyxl
wb = openpyxl.load_workbook('example.xlsx')
sheet = wb.get_sheet_by_name('Sheet1')
sheet['A1'] # <Cell Sheet1.A1>
sheet['A1'].value # datetime.datetime(2015, 4, 5, 13, 34, 2)
c = sheet['B1']
c. value # 'Apples'
'Row ' + str(c.row) + ', Column ' + str(c.column) + ' is ' + c.value # 'Row 1, Column 2 is Apples'
'Cell ' + c. coordinate + ' is ' + c. value # 'Cell B1 is Apples'
sheet['C1'].value # 73
sheet.cell(row=1, column=2) # <Cell Sheet1.B1>

for i in range(1, 8, 2):
print(i, sheet. cell(row=i, column=2). value)
#1 Apples
#3 Pears
#5 Apples
#7 Strawberries

max_row gets the total number of rows in the worksheet, max_column the total number of columns

sheet.max_row # 7
sheet.max_column # 3

4. Conversion between column letters and numbers

import openpyxl
import openpyxl.utils

# convert column letter to numeric index
col_letter = 'D'
col_num = openpyxl.utils.column_index_from_string(col_letter)
print(f"{<!-- -->col_letter} corresponds to the number index: {<!-- -->col_num}") # D corresponds to the number index: 4

# Numeric indices are converted to column letters
col_num = 4
col_letter = openpyxl.utils.get_column_letter(col_num)
print(f"{<!-- -->col_num} corresponds to the column letter: {<!-- -->col_letter}") # 4 corresponds to the column letter: D
from openpyxl.utils import column_index_from_string,get_column_letter
column_index_from_string('aa') # 27

5. Get rows and columns from the table

You can slice the Worksheet object to get all the Cell objects in a row, a column, or a rectangular area in a spreadsheet. You can then loop through all the cells in this slice.

tuple(sheet['A1':'C3'])
# ((<Cell 'Sheet1'.A1>, <Cell 'Sheet1'.B1>, <Cell 'Sheet1'.C1>),
# (<Cell 'Sheet1'.A2>, <Cell 'Sheet1'.B2>, <Cell 'Sheet1'.C2>),
# (<Cell 'Sheet1'.A3>, <Cell 'Sheet1'.B3>, <Cell 'Sheet1'.C3>))

for rowOfCellObjects in sheet['A1':'C3']:
for cellObj in rowOfCellObjects:
print(cellObj. coordinate, cellObj. value)
print('--- END OF ROW ---')
#A1 2015-04-05 13:34:02
#B1 Apples
# C1 73
# --- END OF ROW ---
# -- snip --

sheet.columns returns a generator object, not a list, so it cannot be indexed like a list
If you want to view the cell content of the first column, you can use the iter_cols() method. The parameters of this method can specify the range of the column, for example: min_col=1, max_col=1 means the first column.

import openpyxl
wb = openpyxl.load_workbook('example.xlsx')
sheet = wb.active
for cellObj in sheet.iter_cols(min_col=2, max_col=2): # Column 2
    for cell in cellObj:
        print(cell. value)
#apples
#Cherries
#Pears
#Oranges
#apples
#Bananas
#Strawberries

The parameters of sheet.iter_cols are min_row: starting row number; max_row: ending row number; min_col: starting column number; max_col: ending column number; values_only: bool type, the default is False, if it is True, means to return only the value of the cell without any formatting information.

3. Read data from spreadsheet

1. Read spreadsheet data

Census tract number (A), state abbreviation (B), county name (C), census tract population (D)

import openpyxl,pprint
wb = openpyxl.load_workbook('censuspopdata.xlsx')
sheet = wb['Population by Census Tract']
countyData = {<!-- -->}
# range(2,sheet.max_row + 1) From the second row to the last row, because sheet.max_row is not included, so +1
for row in range(2, sheet. max_row + 1):
    State = sheet['B' + str(row)].value # state
    county = sheet['C' + str(row)].value # county
    pop = sheet['D' + str(row)].value # Population

2. Fill data structure

pop population tracts census area number

for row in range(2,sheet.max_row + 1):
    state = sheet['B' + str(row)].value
    county = sheet['C' + str(row)].value
    pop = sheet['D' + str(row)].value
    
    # Make sure the key for this state exists.
    countyData.setdefault(state, {<!-- -->})
    # Make sure the key for this county in this state exists.
    countyData[state].setdefault(county, {<!-- -->'tracts': 0, 'pop': 0})
    
    # Each row represents one census tract, so increment by one.
    countyData[state][county]['tracts'] + = 1
    # Increase the county pop by the pop in this census tract.
    countyData[state][county]['pop'] + = int(pop)

Add setdefault:
setdefault is used to add key-value pairs to the dictionary. It will add the key-value pair to the dictionary when the key doesn’t exist and it will update the value of the key when it exists
The syntax of setdefault: dictionary.setdefault(key, default_value)
key is the key to check or add, default_value is the value to assign to the key if it is not found in the dictionary

myDict = {<!-- -->'name': 'John', 'age': 26}
address = myDict.setdefault('address', 'Unknown')
print(address) # Output: Unknown
print(myDict) # output: {'name': 'John', 'age': 26, 'address': 'Unknown'}

city = myDict.setdefault('city', 'New York')
print(city) # output: New York
print(myDict) # output: {'name': 'John', 'age': 26, 'address': 'Unknown', 'city': 'New York '}

# If the key already exists, ignore the value assigned in setdefault
myDict.setdefault('name', 'Mike')
print(myDict) # output: {'name': 'John', 'age': 26, 'address': 'Unknown', 'city': 'New York '}

3. Write the result to a file

resultFile = open('census2010.py', 'w')
resultFile.write('allData = ' + pprint.pformat(countyData))
resultFile. close()
print('Done.')

4. Write Excel document

1. Create and save Excel document openpyxl.Workbook()

import openpyxl
# Create a new empty Workbook object
wb = openpyxl. Workbook()
wb.sheetnames # ['Sheet']
sheet = wb. active
sheet.title = 'asdf'
wb.sheetnames # ['asdf']
wb.save('asdf.xlsx') # save the file with asdf.xlsx as the file name

2. Create create_sheet() and delete remove_sheet() worksheet

wb.create_sheet() # Create a worksheet at the last position, the default Sheet, Sheet1, Sheet2...
wb.create_sheet(index=0, title='First Sheet') # Create a worksheet named First Sheet at the front

wb.remove_sheet(wb['Sheet']) # Delete Sheet
del wb['asdf'] # delete asdf worksheet

3. Write the value into the cell

sheet=wb.active
sheet['A1'] = 'Hello World'
sheet['A1'].value # 'Hello World'

5. Update a spreadsheet

Type of product sold (A), price per pound of product (B), pounds sold (C), total revenue from sales

1. Create a data structure with updated information

import openpyxl
wb = openpyxl.load_workbook('produceSales.xlsx')
sheet = wb['Sheet']
PRICE_UPDATES = {<!-- -->'Garlic': 3.07,
                 'Celery': 1.19,
                 'Lemon': 1.27}

2. Check all rows, update incorrect prices

for rowNum in range(2, sheet.max_row + 1): # skip the first row
    produceName = sheet.cell(row=rowNum, column=1).value
    if produceName in PRICE_UPDATES:
        sheet.cell(row=rowNum, column=2).value = PRICE_UPDATES[produceName]
        
wb.save('updatedProduceSales.xlsx')

6. Set the font style of the cell Font object

import openpyxl
from openpyxl. styles import Font, NamedStyle

wb = openpyxl. Workbook()
sheet = wb.active

italic24Font = Font(size=30, italic=False,bold=True,color="ff0000",name='Calibri')
# name font name eg: 'Times New Roman' default font Calibri default size 11
sheet['A1'].font = italic24Font

sheet['A1'] = 'Hello world!'
wb.save('styled.xlsx')

7. Formula

sheet['B9'] = '=SUM(B1:B8)'

sheet['A1'] = 200
sheet['A2'] = 300
sheet['A3'] = '=SUM(A1:A2)'

data_only = True

port openpyxl
>>> wbFormulas = openpyxl.load_workbook('writeFormula.xlsx')
>>> sheet = wbFormulas. get_active_sheet()
>>> sheet['A3'].value
'=SUM(A1:A2)'
>>> wbDataOnly = openpyxl.load_workbook('writeFormula.xlsx', data_only=True)
>>> sheet = wbDataOnly. get_active_sheet()
>>> sheet['A3'].value
500

8. Adjust rows and columns

1. Set row height and column width

The Worksheet object has row_dimensions and column_dimensions attributes to control row height and column width

The row height can be set as an integer or float value between 0 and 409
A point equals 1/72 of an inch, and the default line height is 12.75

Column width can be set as an integer or float between 0 and 255. This value indicates that when using the default font size (11 point), the cell can be displayed
number of characters displayed. The default column width is 8.43 characters. A column width of zero or a row height of zero will make the cell hidden.

import openpyxl
wb = openpyxl. Workbook()
sheet = wb.active
sheet['A1'] = 'Tall row'
sheet['B2'] = 'Wide column'
sheet.row_dimensions[1].height = 70
sheet.column_dimensions['B'].width = 20
wb.save('dimensions.xlsx')

2. Merge and split cells

merge_cells() worksheet method, merge the cells in a rectangular area into one cell

import openpyxl
wb = openpyxl. Workbook()
sheet = wb.active
sheet. merge_cells('A1:D3')
sheet['A1'] = 'Twelve cells merged together.'
sheet.merge_cells('C5:D5')
sheet['C5'] = 'Two merged cells.'
wb.save('merged.xlsx')

To split cells, call the unmerge_cells() worksheet method

sheet.unmerge_cells('A1:D3')
sheet. unmerge_cells('C5:D5')

3. Freeze pane

freeze_panes attribute

In OpenPyXL, each Worksheet object has a freeze_panes property that can be set to a Cell object or a string of cell coordinates. Note that all rows above and all columns to the left of the cell are frozen, but the row and column the cell is in are not.

To unfreeze all cells, set freeze_panes to None or ‘A1’.

sheet.freeze_panes = 'A2' # freeze column A
sheet.freeze_panes = 'C2' # freeze row 1 and column A and column B
wb.save('freezeExample.xlsx')

4. Chart

Create bar, line, scatter, and pie charts

import random
from openpyxl import Workbook
from openpyxl.chart import (
    BarChart,
    reference,
    Series,
)

# Create a Workbook object
wb = Workbook()

# select a worksheet
ws = wb.active

# edit data
for i in range(1, 6):
    # The first line is the column headers
    ws.cell(row=1, column=i).value = f"{<!-- -->i} month"
    # Fill the data, randomly generate an integer between 1 and 100 as sales
    for j in range(2, 7):
        ws.cell(row=j, column=i).value = random.randint(1, 100)

# Create a histogram
chart = BarChart()

# Set the title and style of the chart
chart.title = "Sales"
chart.style = 10

# Get the range of data
data = Reference(ws, min_col=1, min_row=1, max_col=5, max_row=6)

# Add data to the chart
chart.add_data(data, titles_from_data=True)

# add x-axis labels
cats = Reference(ws, min_col=1, min_row=2, max_row=6)
chart.set_categories(cats)

# add the chart to the worksheet
ws.add_chart(chart, "A8")

# save the Excel file
wb.save("sales.xlsx")

>>> import openpyxl
>>> wb = openpyxl. Workbook()
>>> sheet = wb. get_active_sheet()
>>> for i in range(1, 11): # create some data in column A
 sheet['A' + str(i)] = i
>>> refObj = openpyxl.charts.Reference(sheet, (1, 1), (10, 1))
>>> seriesObj = openpyxl.charts.Series(refObj, title='First series')
>>> chartObj = openpyxl.charts.BarChart()
>>> chartObj.append(seriesObj)
>>> chartObj.drawing.top = 50 # set the position
>>> chartObj.drawing.left = 100
>>> chartObj.drawing.width = 300 # set the size
>>> chartObj.drawing.height = 200
>>> sheet.add_chart(chartObj)
>>> wb.save('sampleChart.xlsx')

openpyxl.charts.BarChart(), create a bar chart
openpyxl.charts.LineChart(), openpyxl.charts.ScatterChart(), and openpyxl.charts.PieChart(), create line, scatter, and pie charts.