Digital modeling experience-data processing-pandas

Digital analog experience-data processing-pandas

A detailed explanation of the code will be added in a later update.

import pandas as pd
import numpy as np
# # Set pandas display options
# pd.set_option('display.max_columns', 10)
# pd.set_option('display.max_rows', 100)
# pd.set_option('display.width', 100)

Series basic operations

obj=pd.Series([4,7,-5,3])
obj
0 4
1 7
2 -5
3 3
dtype: int64
obj.values
array([ 4, 7, -5, 3], dtype=int64)
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2=pd.Series([4,7,-5,3],index=['d', 'b', 'a', 'c'])
obj2
d 4
b 7
a -5
c 3
dtype: int64
obj2.index
Index(['d', 'b', 'a', 'c'], dtype='object')
obj2["a"]
-5
obj2[['c', 'a', 'd']]
c 3
a -5
d 4
dtype: int64
obj2[obj2>0]
d 4
b 7
c 3
dtype: int64
obj2 * 2
d 8
b 14
a -10
c 6
dtype: int64
np.exp(obj2)
d 54.598150
b 1096.633158
a 0.006738
c 20.085537
dtype: float64
"b" in obj2
True
4 in obj2.values
True
# Create a Series from a dictionary: the keys become the index labels
# and the values become the data.
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
states =['California', 'Ohio', 'Oregon', 'Texas']
obj4 =pd.Series(sdata,index=states)
obj4
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
pd.isnull(obj4)
California True
Ohio False
Oregon False
Texas False
dtype: bool
pd.notnull(obj4)
California False
Ohio True
Oregon True
Texas True
dtype: bool
obj4.isnull()
California True
Ohio False
Oregon False
Texas False
dtype: bool
obj3
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
obj4
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
obj3 + obj4
California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
obj4.name="population"
obj4.index.name="state"
obj4
state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
obj
0 4
1 7
2 -5
3 3
dtype: int64
obj.index= ['Bob','Steve','Jeff', 'Ryan']
obj
Bob 4
Steve 7
Jeff -5
Ryan 3
dtype: int64

DataFrame basic operations

""""Dataframe"""
'"Dataframe'
# Build a DataFrame from a dict of equal-length lists: each key becomes a
# column and the rows get a default integer index.
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2
frame=pd.DataFrame(frame, columns=frame.columns.sort_values())
frame
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
2 3.6 Ohio 2002
3 2.4 Nevada 2001
4 2.9 Nevada 2002
5 3.2 Nevada 2003
pd.DataFrame(data, columns=['year', 'state', 'pop'])
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9
5 2003 Nevada 3.2
# Sort by column
frame=frame.sort_values(by='year')
frame
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
3 2.4 Nevada 2001
2 3.6 Ohio 2002
4 2.9 Nevada 2002
5 3.2 Nevada 2003
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                         index=['one', 'two', 'three', 'four',
                        'five', 'six'])
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
six 2003 Nevada 3.2 NaN
frame2["state"]
one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
six Nevada
Name: state, dtype: object
frame.year
0 2000
1 2001
3 2001
2 2002
4 2002
5 2003
Name: year, dtype: int64
frame2.loc['three'].values
array([2002, 'Ohio', 3.6, nan], dtype=object)
# Assign to a column: a scalar or array fills every row, while a Series is aligned on the index (unmatched rows become NaN)
frame2['debt'] = 16.5
frame2['debt'] = np.arange(6.)
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
six 2003 Nevada 3.2 NaN
frame2["eastern"]=frame2.state == 'Ohio'
frame2
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False
six 2003 Nevada 3.2 NaN False
# Delete column
del frame2['eastern']
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
six 2003 Nevada 3.2 NaN
frame2.iloc[0:3,0:2]
year state
one 2000 Ohio
two 2001 Ohio
three 2002 Ohio
frame2.loc["one":"three","year":"state"]
year state
one 2000 Ohio
two 2001 Ohio
three 2002 Ohio
ser = pd.Series(np.arange(3.))
ser[-1]
-------------------------------------------------- ----------------------------

ValueError Traceback (most recent call last)

File d:\Anaconda3\envs\guoguo\lib\site-packages\pandas\core\indexes\range.py:391, in RangeIndex.get_loc(self, key, method, tolerance)
    390 try:
--> 391 return self._range.index(new_key)
    392 except ValueError as err:


ValueError: -1 is not in range

?

The above exception was the direct cause of the following exception:


KeyError Traceback (most recent call last)

d:\ai_py_3.9\Common code\pfda\chapter5.ipynb Cell 38 line 2
      <a href='vscode-notebook-cell:/d:/ai_py_3.9/common code/pfda/chapter5.ipynb#X51sZmlsZQ==?line=0'>1</a> ser = pd.Series( np.arange(3.))
----> <a href='vscode-notebook-cell:/d:/ai_py_3.9/common code/pfda/chapter5.ipynb#X51sZmlsZQ==?line=1'>2</a> ser [-1]


File d:\Anaconda3\envs\guoguo\lib\site-packages\pandas\core\series.py:981, in Series.__getitem__(self, key)
    978 return self._values[key]
    980 elif key_is_scalar:
--> 981 return self._get_value(key)
    983 if is_hashable(key):
    984 # Otherwise index.get_value will raise InvalidIndexError
    985 try:
    986 # For labels that don't resolve as scalars like tuples and frozensets


File d:\Anaconda3\envs\guoguo\lib\site-packages\pandas\core\series.py:1089, in Series._get_value(self, label, takeable)
   1086 return self._values[label]
   1088 # Similar to Index.get_value, but we do not fall back to positional
-> 1089 loc = self.index.get_loc(label)
   1090 return self.index._get_values_for_loc(self, loc, label)


File d:\Anaconda3\envs\guoguo\lib\site-packages\pandas\core\indexes\range.py:393, in RangeIndex.get_loc(self, key, method, tolerance)
    391 return self._range.index(new_key)
    392 except ValueError as err:
--> 393 raise KeyError(key) from err
    394 self._check_indexing_error(key)
    395 raise KeyError(key)


KeyError: -1
ser2=pd.Series(np.arange(3.),index=['a','b','c'])
ser2[-1]

2.0
# Two Series whose indexes only partly overlap; adding them aligns on the
# labels, and labels present in only one operand produce NaN.
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
# The last label is 'g' (no trailing space), matching the other
# single-letter labels and the printed output.
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1
a 7.3
c -2.5
d 3.4
e 1.5
dtype: float64
s2
a -2.1
c 3.6
e -1.5
f 4.0
g 3.1
dtype: float64
s1 + s2
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
# Two frames with partially overlapping row and column labels; adding them
# aligns on both axes, and cells missing from either operand become NaN.
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1
b c d
Ohio 0.0 1.0 2.0
Texas 3.0 4.0 5.0
Colorado 6.0 7.0 8.0
df2
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
df1 + df2
b c d e
Colorado NaN NaN NaN NaN
Ohio 3.0 NaN 6.0 NaN
Oregon NaN NaN NaN NaN
Texas 9.0 NaN 12.0 NaN
Utah NaN NaN NaN NaN
# Two frames with the same row index but no column labels in common:
# subtracting them yields the union of the columns with every value NaN.
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df1
A
0 1
1 2
df2
B
0 3
1 4
df1-df2
A B
0 NaN NaN
1 NaN NaN
list("afs")
['a', 'f', 's']