Machine Learning/논문\코드 정리

Numpy(matrix 생성) & Pandas(dataframe 생성)

WakaraNai 2021. 11. 17. 12:33
728x90
반응형

import

import numpy as np

 

matrix 생성 : np.array([]), np.zeros(()), np.arange(start, stop)

# 1-D array built from a flat Python list of floats.
one_dimensional_array = np.asarray([1.2, 2.4, 3.5, 4.7, 6.1, 7.2, 8.3, 9.5])
print(one_dimensional_array)
# -> [1.2 2.4 3.5 4.7 6.1 7.2 8.3 9.5]

# 2-D array with 3 rows and 2 columns; the six values are laid out row by row.
two_dimensional_array = np.array([6, 5, 11, 7, 4, 8]).reshape(3, 2)
print(two_dimensional_array)
# -> [[ 6  5]
#     [11  7]
#     [ 4  8]]

#######################
# np.zeros: zero-filled arrays of a requested shape (float by default).

np.zeros(shape=5)
# array([ 0.,  0.,  0.,  0.,  0.])

np.zeros(5, dtype=int)
# array([0, 0, 0, 0, 0])

np.zeros(shape=(2, 1))
# array([[ 0.],
#        [ 0.]])

# Structured (custom) dtype: every element is a record with two int32 fields.
np.zeros(2, dtype=[('x', 'i4'), ('y', 'i4')])
# array([(0, 0), (0, 0)],
#       dtype=[('x', '<i4'), ('y', '<i4')])

########################
# np.arange: consecutive integers, start included and stop excluded.

sequence_of_integers = np.arange(5, 12, 1)
print(sequence_of_integers)
# -> [ 5  6  7  8  9 10 11]

 

+) random number matrix
# Six random integers; randint's upper bound is exclusive, so 101 makes 100 reachable.
random_integers_between_50_and_100 = np.random.randint(50, 101, size=6)
print(random_integers_between_50_and_100)
# e.g. [75 59 87 74 76 91]

# Six random floats drawn from the half-open interval [0.0, 1.0).
random_floats_between_0_and_1 = np.random.random(size=6)
print(random_floats_between_0_and_1)
# e.g. [0.77619074 0.60823013 0.24196195 0.53824909 0.07297021 0.03829798]



# Noise-free labels. The original snippet used `label` without ever defining
# it (NameError when run on its own); the sample output below implies the
# starting values were 22, 25, ..., 64 (15 values, step 3).
label = np.arange(22, 65, 3)

# Add some noise to the dataset: uniform floats in [-2, +2), one per label.
# random() yields [0, 1); scaling by 4 and shifting by -2 maps it to [-2, 2).
noise = (np.random.random(label.shape) * 4) - 2
print(noise)
label = label + noise
print(label)
'''
[ 1.64943486 -0.67351056  1.80170699 -0.82078229  1.58230154 -1.67931272
  1.44076759  0.79557979  0.04125786  0.11340113  0.97559808 -0.16299612
 -0.59453335 -0.45360663 -0.19260686]
[23.64943486 24.32648944 29.80170699 30.17921771 35.58230154 35.32068728
 41.44076759 43.79557979 46.04125786 49.11340113 52.97559808 54.83700388
 57.40546665 60.54639337 63.80739314]
 '''

 

matrix 연산

두 벡터 또는 행렬을 더하거나 빼려면 선형 대수에서 두 피연산자의 차원이 동일해야 합니다.

또한 두 벡터 또는 행렬을 곱하려는 경우 선형 대수는 피연산자의 차원 호환성에 엄격한 규칙을 적용합니다.

다행히도 NumPy는 브로드캐스팅이라는 트릭을 사용하여

선형 대수에 호환되는 차원으로 더 작은 피연산자를 확장합니다.

예를 들어, 다음 연산은 브로드캐스팅을 사용하여 이전 코드 셀에서 생성된 벡터의 모든 항목 값에 2.0을 추가합니다.

# Broadcasting: the scalar 2.0 is applied to every element of the vector.
random_floats_between_2_and_3 = np.add(random_floats_between_0_and_1, 2.0)
print(random_floats_between_2_and_3)
# e.g. [2.77619074 2.60823013 2.24196195 2.53824909 2.07297021 2.03829798]

# The same mechanism works for multiplication: every element is tripled.
random_integers_between_150_and_300 = np.multiply(random_integers_between_50_and_100, 3)
print(random_integers_between_150_and_300)
# e.g. [225 177 261 222 228 273]

 

 

 


 

import

import numpy as np
import pandas as pd

 

dataframe from numpy array

# Names of the two columns, in the same order as the array's columns.
my_column_names = ['temperature', 'activity']

# 5x2 NumPy array assembled column-wise: temperatures first, activities second.
my_data = np.column_stack(([0, 10, 20, 30, 40], [3, 7, 9, 14, 15]))

# Wrap the array in a DataFrame and label its columns.
my_dataframe = pd.DataFrame(my_data, columns=my_column_names)

# Show the whole DataFrame.
print(my_dataframe)
'''
   temperature  activity
0            0         3
1           10         7
2           20         9
3           30        14
4           40        15
'''

 

Add a new column

# Assigning to a column label that does not exist yet appends a new column;
# here "adjusted" is every "activity" value increased by 2.
my_dataframe["adjusted"] = my_dataframe["activity"].add(2)

# Show the whole DataFrame, now with three columns.
print(my_dataframe)
'''
   temperature  activity  adjusted
0            0         3         5
1           10         7         9
2           20         9        11
3           30        14        16
4           40        15        17
'''

 

Specifying a subset of a dataframe

head(), tail()

iloc, loc

slicing

column name indexing

# head(n) keeps the first n rows.
print("Rows #0, #1, and #2:")
print(my_dataframe.head(n=3), '\n')

# iloc with a one-element list selects by position and still yields a DataFrame.
print("Row #2:")
print(my_dataframe.iloc[[2], :], '\n')

# An integer slice applied to the DataFrame selects rows; the end is excluded.
print("Rows #1, #2, and #3:")
print(my_dataframe[slice(1, 4)], '\n')

# Indexing with a column name returns that single column.
print("Column 'temperature':")
print(my_dataframe['temperature'])

'''
Rows #0, #1, and #2:
   temperature  activity  adjusted
0            0         3         5
1           10         7         9
2           20         9        11 

Row #2:
   temperature  activity  adjusted
2           20         9        11 

Rows #1, #2, and #3:
   temperature  activity  adjusted
1           10         7         9
2           20         9        11
3           30        14        16 

Column 'temperature':
0     0
1    10
2    20
3    30
4    40
Name: temperature, dtype: int64
'''

 

 

Ex) dataframe 생성 실습

1. 3x4

2. the columns are named Eleanor, Chidi, Tahani, and Jason

3. Each cell has a random integer between 0 and 100, inclusive.

4. Janet이라는 5번째 컬럼을 추가. 이는 Tahani와 Jason 컬럼을 더한 값

#@title Double-click for a solution to Task 1.

# Labels for the four columns.
my_column_names = ['Eleanor', 'Chidi', 'Tahani', 'Jason']

# 3x4 array of random integers; the exclusive upper bound 101 makes 100 reachable.
my_data = np.random.randint(0, 101, size=(3, 4))

# Build the DataFrame from the array and the column labels.
df = pd.DataFrame(my_data, columns=my_column_names)

# Show the whole DataFrame.
print(df)


'''
  Eleanor  Chidi  Tahani  Jason
0       72     77      99      8
1       94     11      14     16
2       47     92      31     52
'''


# Read one cell: row label 1 of the Eleanor column.
print("\nSecond row of the Eleanor column: %d\n" % df.at[1, 'Eleanor'])

# New column Janet holds the row-wise sum of Tahani and Jason.
df['Janet'] = df['Tahani'].add(df['Jason'])

# Show the DataFrame with the extra column.
print(df)

'''
Second row of the Eleanor column: 94

   Eleanor  Chidi  Tahani  Jason  Janet
0       72     77      99      8    107
1       94     11      14     16     30
2       47     92      31     52     83
'''

 

 

 

Copy or Reference of DataFrame

# Plain assignment binds a second name to the same DataFrame object as df;
# no data is copied.
print("Experiment with a reference:")
reference_to_df = df

# Read the same cell through both names before changing anything.
print("  Starting value of df: %d" % df.at[1, 'Jason'])
print("  Starting value of reference_to_df: %d\n" % reference_to_df.at[1, 'Jason'])
'''
Experiment with a reference:
  Starting value of df: 16
  Starting value of reference_to_df: 16
  '''

# Change the cell through df; the sample output shows reference_to_df reporting the new value too.
df.at[1, 'Jason'] += 5
print("  Updated df: %d" % df.at[1, 'Jason'])
print("  Updated reference_to_df: %d\n\n" % reference_to_df.at[1, 'Jason'])
'''
  Updated df: 21
  Updated reference_to_df: 21
'''



# copy() produces an independent DataFrame (deep copy is the default).
print("Experiment with a true copy:")
copy_of_my_dataframe = my_dataframe.copy(deep=True)

# Read the same cell from the original and from the copy before changing anything.
print("  Starting value of my_dataframe: %d" % my_dataframe.at[1, 'activity'])
print("  Starting value of copy_of_my_dataframe: %d\n" % copy_of_my_dataframe.at[1, 'activity'])

'''Experiment with a true copy:
  Starting value of my_dataframe: 7
  Starting value of copy_of_my_dataframe: 7
  '''


# Change the cell in my_dataframe only; the copy keeps its old value.
my_dataframe.at[1, 'activity'] += 3
print("  Updated my_dataframe: %d" % my_dataframe.at[1, 'activity'])
print("  copy_of_my_dataframe does not get updated: %d" % copy_of_my_dataframe.at[1, 'activity'])
'''
 Updated my_dataframe: 10
  copy_of_my_dataframe does not get updated: 7
  '''
728x90
반응형