Mathematical modeling experience - data processing - pandas

Mathematical modeling experience - data processing - pandas

Detailed explanation of the code: will be added next time

import pandas as pd
import numpy as np

# # Set pandas display options
# pd.set_option('display.max_columns', 10)
# pd.set_option('display.max_rows', 100)
# pd.set_option('display.width', 100)

Series basic operations

# Build a Series from a plain list; with no index given, a default integer index is used.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0 4
1 7
2 -5
3 3
dtype: int64

obj.values

array([ 4, 7, -5, 3], dtype=int64)

obj.index

RangeIndex(start=0, stop=4, step=1)

# Same values as before, but with explicit string labels as the index.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
obj2

d 4
b 7
a -5
c 3
dtype: int64

obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

obj2["a"]

-5

obj2[['c', 'a', 'd']]

c 3
a -5
d 4
dtype: int64

obj2[obj2>0]

d 4
b 7
c 3
dtype: int64

obj2 * 2

d 8
b 14
a -10
c 6
dtype: int64

np.exp(obj2)

d 54.598150
b 1096.633158
a 0.006738
c 20.085537
dtype: float64

"b" in obj2

True

4 in obj2.values

True

# Create a Series from a dictionary: the keys become the index labels
# and the values become the data.
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64

# Supply an explicit index with the dict: labels missing from `sdata`
# (California) come out as NaN, and dict keys not listed (Utah) are dropped.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(data=sdata, index=states)
obj4

California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64

pd.isnull(obj4)

California True
Ohio False
Oregon False
Texas False
dtype: bool

pd.notnull(obj4)

California False
Ohio True
Oregon True
Texas True
dtype: bool

obj4.isnull()

California True
Ohio False
Oregon False
Texas False
dtype: bool

obj3

Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64

obj4

California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64

obj3 + obj4

California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64

# Name the Series and its index; both names show up in the printed output below.
obj4.name="population"
obj4.index.name="state"
obj4

state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64

obj

0 4
1 7
2 -5
3 3
dtype: int64

# Replace the index labels in place by assigning a new list (one label per element).
obj.index= ['Bob','Steve','Jeff', 'Ryan']
obj

Bob 4
Steve 7
Jeff -5
Ryan 3
dtype: int64

“””Dataframe”””

""""Dataframe"""

'"Dataframe'

# Build a DataFrame from a dict of equal-length lists; each key becomes a column.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame

	state	year	pop
0	Ohio	2000	1.5
1	Ohio	2001	1.7
2	Ohio	2002	3.6
3	Nevada	2001	2.4
4	Nevada	2002	2.9
5	Nevada	2003	3.2

# Rebuild the frame with its columns arranged in sorted order (pop, state, year).
sorted_columns = frame.columns.sort_values()
frame = pd.DataFrame(frame, columns=sorted_columns)
frame

	pop	state	year
0	1.5	Ohio	2000
1	1.7	Ohio	2001
2	3.6	Ohio	2002
3	2.4	Nevada	2001
4	2.9	Nevada	2002
5	3.2	Nevada	2003

pd.DataFrame(data, columns=['year', 'state', 'pop'])

	year	state	pop
0	2000	Ohio	1.5
1	2001	Ohio	1.7
2	2002	Ohio	3.6
3	2001	Nevada	2.4
4	2002	Nevada	2.9
5	2003	Nevada	3.2

# Sort the rows by the values in the 'year' column; each row keeps its original label.
frame=frame.sort_values(by='year')
frame

	pop	state	year
0	1.5	Ohio	2000
1	1.7	Ohio	2001
3	2.4	Nevada	2001
2	3.6	Ohio	2002
4	2.9	Nevada	2002
5	3.2	Nevada	2003

# Ask for a column ('debt') that `data` does not contain: it is created and filled
# with NaN. Custom row labels replace the default integer index.
column_order = ['year', 'state', 'pop', 'debt']
row_labels = ['one', 'two', 'three', 'four', 'five', 'six']
frame2 = pd.DataFrame(data, index=row_labels, columns=column_order)
frame2

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	NaN
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	NaN
five	2002	Nevada	2.9	NaN
six	2003	Nevada	3.2	NaN

frame2["state"]

one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
six Nevada
Name: state, dtype: object

frame.year

0 2000
1 2001
3 2001
2 2002
4 2002
5 2003
Name: year, dtype: int64

frame2.loc['three'].values

array([2002, 'Ohio', 3.6, nan], dtype=object)

# Add or overwrite a column. Each assignment below replaces the previous 'debt'
# values, so only the last one is visible in the output.
# 1) A scalar is assigned to every row.
frame2['debt'] = 16.5
# 2) An array supplies one value per row (frame2 has six rows).
frame2['debt'] = np.arange(6.)
# 3) A Series is matched on index labels; rows whose label is absent from `val`
#    ('one', 'three', 'six') end up as NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	-1.2
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	-1.5
five	2002	Nevada	2.9	-1.7
six	2003	Nevada	3.2	NaN

frame2["eastern"]=frame2.state == 'Ohio'
frame2

	year	state	pop	debt	eastern
one	2000	Ohio	1.5	NaN	True
two	2001	Ohio	1.7	-1.2	True
three	2002	Ohio	3.6	NaN	True
four	2001	Nevada	2.4	-1.5	False
five	2002	Nevada	2.9	-1.7	False
six	2003	Nevada	3.2	NaN	False

# Delete column
del frame2['eastern']
frame2

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	-1.2
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	-1.5
five	2002	Nevada	2.9	-1.7
six	2003	Nevada	3.2	NaN

frame2.iloc[0:3,0:2]

	year	state
one	2000	Ohio
two	2001	Ohio
three	2002	Ohio

frame2.loc["one":"three","year":"state"]

	year	state
one	2000	Ohio
two	2001	Ohio
three	2002	Ohio

# Intentional error demo: `ser` has the default integer index, so the key -1 is
# looked up as a label rather than a position, and the lookup raises KeyError
# (see the traceback below).
ser = pd.Series(np.arange(3.))
ser[-1]

-------------------------------------------------- ----------------------------

ValueError Traceback (most recent call last)

File d:\Anaconda3\envs\guoguo\lib\site-packages\pandas\core\indexes\range.py:391, in RangeIndex.get_loc(self, key, method, tolerance)
    390 try:
--> 391 return self._range.index(new_key)
    392 except ValueError as err:


ValueError: -1 is not in range

The above exception was the direct cause of the following exception:


KeyError Traceback (most recent call last)

d:\ai_py_3.9\Common code\pfda\chapter5.ipynb Cell 38 line 2
      <a href='vscode-notebook-cell:/d:/ai_py_3.9/common code/pfda/chapter5.ipynb#X51sZmlsZQ==?line=0'>1</a> ser = pd.Series( np.arange(3.))
----> <a href='vscode-notebook-cell:/d:/ai_py_3.9/common code/pfda/chapter5.ipynb#X51sZmlsZQ==?line=1'>2</a> ser [-1]


File d:\Anaconda3\envs\guoguo\lib\site-packages\pandas\core\series.py:981, in Series.__getitem__(self, key)
    978 return self._values[key]
    980 elif key_is_scalar:
--> 981 return self._get_value(key)
    983 if is_hashable(key):
    984 # Otherwise index.get_value will raise InvalidIndexError
    985 try:
    986 # For labels that don't resolve as scalars like tuples and frozensets


File d:\Anaconda3\envs\guoguo\lib\site-packages\pandas\core\series.py:1089, in Series._get_value(self, label, takeable)
   1086 return self._values[label]
   1088 # Similar to Index.get_value, but we do not fall back to positional
-> 1089 loc = self.index.get_loc(label)
   1090 return self.index._get_values_for_loc(self, loc, label)


File d:\Anaconda3\envs\guoguo\lib\site-packages\pandas\core\indexes\range.py:393, in RangeIndex.get_loc(self, key, method, tolerance)
    391 return self._range.index(new_key)
    392 except ValueError as err:
--> 393 raise KeyError(key) from err
    394 self._check_indexing_error(key)
    395 raise KeyError(key)


KeyError: -1

# Positional access on a label-indexed Series. Use .iloc for positions: the bare
# `ser2[-1]` form relies on an integer-key positional fallback that pandas has
# deprecated, whereas .iloc[-1] always returns the last element.
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2.iloc[-1]

2.0

# Two Series with partially overlapping labels, used below to show index alignment.
# The last label of s2 is 'g' (no trailing space), matching the printed output.
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1

a 7.3
c -2.5
d 3.4
e 1.5
dtype: float64

s2

a -2.1
c 3.6
e -1.5
f 4.0
g 3.1
dtype: float64

s1 + s2

a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64

# Two frames whose row and column labels only partly overlap, used below to show
# that arithmetic aligns on both axes and fills non-matching cells with NaN.
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1

	b	c	d
Ohio	0.0	1.0	2.0
Texas	3.0	4.0	5.0
Colorado	6.0	7.0	8.0

df2

	b	d	e
Utah	0.0	1.0	2.0
Ohio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

df1 + df2

	b	c	d	e
Colorado	NaN	NaN	NaN	NaN
Ohio	3.0	NaN	6.0	NaN
Oregon	NaN	NaN	NaN	NaN
Texas	9.0	NaN	12.0	NaN
Utah	NaN	NaN	NaN	NaN

# Two frames that share row labels but have no column in common.
df1 = pd.DataFrame({'A': [1, 2]})

df2 = pd.DataFrame({'B': [3, 4]})
df1

	A
0	1
1	2

df2

	B
0	3
1	4

df1-df2

	A	B
0	NaN	NaN
1	NaN	NaN

list("afs")

['a', 'f', 's']