'Google/Photo' 카테고리의 글 목록

Google/Photo

텍스트 2024.09.04 1
이미지 2024.09.04
구글 포토(Google Photo) 사진 다운로드하기 (부분 or 일괄) 2020.12.27 1

텍스트

2024. 9. 4. 23:54

## 1. 라이브러리 임포트

# import numpy as np
# import pandas as pd

import numpy as np
import pandas as pd

## 2. 파일 읽어오기

# Load the dataset that was built from the AI-HUB emotional-dialogue corpus.
# (Exercise blank in the original notebook: final_data = pd.[fill in]('<url>'))

final_data = pd.read_csv('https://github.com/ohgzone/file1/raw/main/aihub_coupus.csv' )

# Peek at the first rows.
final_data.head()

# Column/dtype summary. Author's note: 51,630 rows in total.
final_data.info()

## 3. 영문, 숫자, 특수문자 제거

# Characters that are neither Hangul syllables nor spaces (Latin letters,
# digits, punctuation, ...). Defined once so that the check, the clean-up and
# the verification below cannot drift apart.
non_korean_pattern = r'[^가-힣 ]'

# Show up to 10 rows whose '문장' (sentence) column contains such characters.
final_data[final_data['문장'].str.contains(non_korean_pattern)].values[:10]

# Remove every non-Hangul, non-space character from the sentences.
# (final_data['문장'].replace(pattern, '', regex=True) would work as well.)
final_data['문장'] = final_data['문장'].str.replace(non_korean_pattern, '', regex=True)

# Verify the clean-up by counting the rows that still match; expected 0.
# The boolean mask is counted instead of summing the matching sentences,
# which would concatenate text rather than report a number.
final_data['문장'].str.contains(non_korean_pattern).sum()

# Inspect the data after the clean-up.
final_data.head()

## 4. 전처리 : Null, 중복 제거

# Look at the columns and the last rows once more before preprocessing.
final_data.tail()

# Trim leading/trailing whitespace from every sentence.
trimmed_sentences = final_data['문장'].str.strip()
final_data['문장'] = trimmed_sentences

# Check the result.
final_data.tail()

# Missing values per column. Author's note: none.
final_data.isna().sum()

# Number of rows whose sentence repeats an earlier one.
# Author's note: 56 duplicates.
final_data.duplicated(subset=['문장']).sum()

# Keep only the first occurrence of each sentence.
final_data = final_data.drop_duplicates(subset=['문장'])

# Author's note: 51,630 rows before, 51,574 after (56 duplicates removed).
final_data.info()

## 5. Label 분포 확인

# Class balance of the '감정' (emotion) label. Author's note: six classes,
# roughly balanced, with '기쁨' (joy) slightly under-represented.
emotion_counts = final_data['감정'].value_counts()
emotion_counts

# The same counts as a bar chart.
emotion_counts.plot.bar()

## 6. label 숫자로 인코딩

# Map each emotion name to an integer class id.

from sklearn.preprocessing import LabelEncoder

# Learn the classes first, then overwrite the column with the integer codes.
encoder = LabelEncoder().fit(final_data['감정'])
final_data['감정'] = encoder.transform(final_data['감정'])
encoder.classes_

final_data.tail()

## 7. X, Y 분리

# Separate the inputs (sentences) from the targets (encoded emotions).
features = final_data['문장'].values
labels = final_data['감정'].values

# Both should report the same number of rows.
features.shape, labels.shape

# Show the first three sentences.
features[:3]

# matplotlib is not imported anywhere earlier in this notebook, so import it
# here; without it the histogram below fails with a NameError.
import matplotlib.pyplot as plt

# Character length of every sentence, computed once and reused below.
sentence_lengths = [len(sentence) for sentence in features]

# Longest and average sentence length (in characters).
print(f'event word max :{max(sentence_lengths)}')
print(f'event word avg :{sum(sentence_lengths) / len(sentence_lengths)}')

# Author's note: the histogram shows most sentences are around 30-40
# characters long.
plt.hist(sentence_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

## 8. train set와 test set 분리

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; stratify on the labels and fix the
# seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size =0.2, stratify=labels , random_state= 41)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

# Inspect a couple of samples together with their labels.
# Author's mapping note: {0: '불안', 1: '분노', 2: '상처', 3: '슬픔', 4: '당황', 5: '기쁨'}
# NOTE(review): in this notebook the labels come from LabelEncoder, so the
# real code-to-emotion mapping is whatever encoder.classes_ reports -- confirm
# the mapping above against it before relying on it.

x_train[:2], y_train[:2]

# Convert the corpus into TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on the training split only; the test split is
# transformed with the already-fitted vectorizer.
tfidf = TfidfVectorizer()
x_train_v = tfidf.fit_transform(x_train)
x_test_v = tfidf.transform(x_test)

# Print the TF-IDF value of each term in each row.
print(x_train_v)

# Shape of the training TF-IDF matrix.
# Author's note: 41,259 rows and 47,366 terms.
x_train_v.shape

## 9. 머신러닝 모델링

# Author's note: training took about 4 minutes on Colab.
from sklearn.ensemble import RandomForestClassifier

# random_state makes the forest (and therefore the reported score) repeatable,
# in line with the seeded train/test split above; n_jobs=-1 builds the trees
# on all available cores to shorten the training time.
rfc = RandomForestClassifier(random_state=41, n_jobs=-1)
rfc.fit(x_train_v, y_train)

# Mean accuracy on the held-out test split.
rfc.score(x_test_v, y_test)

### 10. 예측해 보기

# How to read the printed output, e.g. "(0, 44327) 0.241660101642553":
#   0 = row index, 44327 = index assigned to the term, 0.2416... = TF-IDF value.
first_test_row = x_test_v[0]
print(f'검증 데이터 셋의 첫번째 TF-IDF: {first_test_row}')

# Map the first test row back to the terms it contains.
first_row_terms = tfidf.inverse_transform(x_test_v[:1])
print('검증데이터셋의 첫번째 TF-IDF 역변환 : {}'.format(first_row_terms))

# Predict the first test sample with the random forest and decode the class id
# back to its emotion name.
predict = rfc.predict(x_test_v[:1])
predict, encoder.inverse_transform(predict)

 
 
 
 

# 배운 내용 정리
1. AI-HUB 감정말뭉치 데이터 읽어오고
2. 데이터 전처리 : 한글, 공백외의 영어, 숫자, 특수문자등 제거
3. TF-IDF 벡터라이저(TfidfVectorizer)를 활용하여 문장을 토큰화하고 숫자 벡터로 변환
4. 머신러닝 RandomForest 모델을 활용하여 감성분류 수행

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

# B. LSTM 모델 이용하여 Classification하기
+ AI-HUB 감정말뭉치 데이터를 가지고
+ LSTM 모델을 학습하여 감정 분류를 해 보겠습니다.

 
 

## 1. 라이브러리 임포트

import numpy as np
import pandas as pd

## 2. 파일 읽어오기

# Load the dataset that was built from the AI-HUB emotional-dialogue corpus.
final_data = pd.read_csv('https://github.com/ohgzone/file1/raw/main/aihub_coupus.csv' )

# Peek at the first rows.
final_data.head()

# Column/dtype summary. Author's note: 51,630 rows in total.
final_data.info()

## 3. 영문, 숫자 특수문자 제거

# Characters that are neither Hangul syllables nor spaces (Latin letters,
# digits, punctuation, ...). Defined once so that the check, the clean-up and
# the verification below cannot drift apart.
non_korean_pattern = r'[^가-힣 ]'

# Show up to 10 rows whose '문장' (sentence) column contains such characters.
final_data[final_data['문장'].str.contains(non_korean_pattern)].values[:10]

# Remove every non-Hangul, non-space character from the sentences.
final_data['문장'] = final_data['문장'].str.replace(non_korean_pattern, '', regex=True)

# Verify the clean-up by counting the rows that still match; expected 0.
# The boolean mask is counted instead of summing the matching sentences,
# which would concatenate text rather than report a number.
final_data['문장'].str.contains(non_korean_pattern).sum()

# Inspect the data after the clean-up.
final_data.head()

## 4. 전처리 : Null, 중복 제거

# Look at the columns and the last rows once more before preprocessing.
final_data.tail()

# Trim leading/trailing whitespace from every sentence.
trimmed_sentences = final_data['문장'].str.strip()
final_data['문장'] = trimmed_sentences

# Check the result.
final_data.tail()

# Missing values per column. Author's note: none.
final_data.isna().sum()

# Number of rows whose sentence repeats an earlier one.
# Author's note: 56 duplicates.
final_data.duplicated(subset=['문장']).sum()

# Keep only the first occurrence of each sentence.
final_data = final_data.drop_duplicates(subset=['문장'])

# Author's note: 51,630 rows before, 51,574 after (56 duplicates removed).
final_data.info()

## 5. Label 분포 확인

# Class balance of the '감정' (emotion) label. Author's note: six classes,
# roughly balanced, with '기쁨' (joy) slightly under-represented.
emotion_counts = final_data['감정'].value_counts()
emotion_counts

# The same counts as a bar chart.
emotion_counts.plot.bar()

## 6. label 숫자로 인코딩

# Emotion names in the order value_counts() reports them.
list1 = final_data['감정'].value_counts().index.values
list1

# Two lookup tables built from that list:
#   label2class: emotion name -> class id
#   class2label: class id -> emotion name
label2class = {emotion: class_id for class_id, emotion in enumerate(list1)}
class2label = dict(enumerate(list1))

print(label2class)
print(class2label)

# Add the integer class id of every row as the 'label' column.
final_data['label'] = final_data['감정'].map(label2class)

final_data.tail()

## 7. X, Y 분리

# Separate the inputs (sentences) from the targets (integer class ids).
features = final_data['문장'].values
labels = final_data['label'].values

# Both should report the same number of rows.
features.shape, labels.shape

# Show the first three sentences.
features[:3]

# matplotlib is not imported anywhere earlier in this notebook, so import it
# here; without it the plots below fail with a NameError.
import matplotlib.pyplot as plt

# Character length of every sentence, computed once and reused below.
sentence_lengths = [len(sentence) for sentence in features]

# Longest and average sentence length (in characters).
print(f'이벤트 문자열 최대 길이 :{max(sentence_lengths)}')
print(f'이벤트 문자열 평균 길이 :{sum(sentence_lengths) / len(sentence_lengths)}')

# Author's note: the histogram shows most sentences are around 30-40
# characters long.
plt.hist(sentence_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

## 8. train set와 test set 분리

from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; stratify on the labels and fix the
# seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(features, labels , test_size=0.2, stratify=labels, random_state=41)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

# Inspect a couple of samples together with their labels.
# Author's mapping note: {0: '불안', 1: '분노', 2: '상처', 3: '슬픔', 4: '당황', 5: '기쁨'}
# NOTE(review): the authoritative mapping is the class2label dict printed
# earlier; this hard-coded copy is presumably what it showed -- confirm.

x_train[:2], y_train[:2]

## 9. 전체 문장에 대해 Tokenizing
+ 컴퓨터가 이해하기 위해 모든 단어를 숫자로 변환해야 함.
+ 단어 빈도수 따지지 않고 무조건 모든 단어 수용해서 진행

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary with the Keras Tokenizer (fit_on_texts), using the
# training sentences only.

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

# Word -> integer index mapping.
print(tokenizer.word_index)

# Reverse mapping: integer index -> word.
print(tokenizer.index_word)

# Occurrence count of every word.
print(tokenizer.word_counts)

# Vocabulary size. Author's note: 47,646 words.
max_words = len(tokenizer.index_word)
print(max_words)

## 10. texts_to_sequences : 문장을 숫자로 나열
- 빈도수 적은 단어 제외하는것 없이 모든 단어 포함해서 진행
- 그리고, 예를 들어 1번 등장하는 단어는 삭제하는 작업은 필요시 수행!!

# Turn every sentence into a list of integer word indices.

x_train_seq= tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

# Sample counts after the conversion.
# Author's note on the split shapes: x_train (41259,), x_test (10315,),
# y_train (41259,), y_test (10315,)
print(len(x_train_seq), len(x_test_seq))

# Compare two raw sentences with their integer sequences.
print(x_train[1:3])
print(x_train_seq[1:3])

## 11. Padding Sequence

# Length (in tokens) of the longest training sequence.
# Author's note: 38 for this corpus.
max_seq_len = max(len(seq) for seq in x_train_seq)
max_seq_len

# Bring every sequence to that length. The value is derived from the training
# data instead of being hard-coded, and the same length is applied to the test
# set so both arrays end up with the same width.
x_train_pad = pad_sequences(x_train_seq, maxlen=max_seq_len)
x_test_pad = pad_sequences(x_test_seq, maxlen=max_seq_len)

# Look at one padded sequence.
x_train_pad[:1]

# Shapes of the padded arrays.
x_train_pad.shape, x_test_pad.shape

## 12. LSTM 모델링

from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, SimpleRNN, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Hyperparameters

# Vocabulary size + 1 for the padding index 0. Taken from the fitted tokenizer
# instead of a hard-coded 47646 so the embedding table always covers every
# word index the tokenizer can emit.
max_words = len(tokenizer.word_index) + 1
# Sequence length fed to the model; read from the padded training array so it
# always matches the padding step (38 in the author's run).
max_len = x_train_pad.shape[1]
embedding_dim = 32  # size of each word embedding vector

# Model definition, written as a single layer list:
#   Embedding turns each word index into an embedding_dim-sized vector,
#   two stacked LSTM layers return the full sequence,
#   Flatten + Dense layers classify into the 6 emotion classes.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    LSTM(16, return_sequences=True),
    LSTM(16, return_sequences=True),
    Flatten(),
    Dense(128, activation='swish'),
    Dense(32, activation='swish'),
    Dense(6, activation='softmax'),
])

# Integer class ids are used as targets, hence the sparse loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# Early-stopping callback: monitors val_loss with a patience of 10 epochs.
es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

# Checkpoint callback: monitors val_loss and keeps only the best model on disk.
checkpoint_path = 'tmp_checkpoint.keras'
cp = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True)

# Train the model; 20% of the training data is held out for validation.
# NOTE(review): the best checkpoint is saved but never loaded back, so the
# evaluation that follows presumably runs on the last-epoch weights -- confirm
# whether restore_best_weights=True or reloading the checkpoint is intended.
history = model.fit(x_train_pad, y_train, epochs=50, batch_size=512,
 validation_split=0.2, verbose =1, callbacks=[es, cp])

# Training vs. validation accuracy per epoch.
epochs = range(1, len(history.history['accuracy']) + 1)
plt.plot(epochs, history.history['accuracy'])
plt.plot(epochs, history.history['val_accuracy'])
plt.title('model accuracy')
# The curves are accuracies, so label the axis accordingly (it used to read
# 'loss').
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'valid'])
plt.show()

# Loss and accuracy on the held-out test set.
model.evaluate(x_test_pad, y_test)

### 13. 예측해 보기

# Show the first test sentence and its padded integer sequence.
first_text, first_padded = x_test[0], x_test_pad[0]
print(f'문자열 : {first_text}')
print(f'Sequence : {first_padded}')

# Predict class probabilities for that single sample.
predict = model.predict(x_test_pad[:1])

# Decode the true and the predicted class ids back to emotion names.
true_emotion = class2label[y_test[0]]
predicted_emotion = class2label[np.argmax(predict)]
print(f'True : {true_emotion}')
print(f'Predict : {predicted_emotion}')

저작자표시 비영리 변경금지

'Google > Photo' 카테고리의 다른 글

이미지 (0)	2024.09.04
구글 포토(Google Photo) 사진 다운로드하기 (부분 or 일괄) (1)	2020.12.27

이미지

2024. 9. 4. 23:47

### 1.필요한 라이브러리 임포트

import os
from glob import glob
from PIL import Image

import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt

### 2.이미지 파일 가져오기

# Uses a dataset of roughly 3,700 flower photos.
# The download snippet below is used as-is.

import pathlib
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
# Fetch the archive (untar=True) and keep the returned location as a Path.
data_dir = tf.keras.utils.get_file('flower_photos', origin=dataset_url, untar=True)
data_dir = pathlib.Path(data_dir)

# Show the dataset path.
data_dir

# List the sub-folders of the image folder.
# (The lines starting with "!" are IPython shell escapes and only run inside a
# notebook.)

!ls -l /root/.keras/datasets/flower_photos/

# Number of jpg images in the daisy folder
!ls -l /root/.keras/datasets/flower_photos/daisy | grep jpg | wc -l

# Number of jpg images in the dandelion folder
!ls -l /root/.keras/datasets/flower_photos/dandelion | grep jpg | wc -l

# Number of jpg images in the roses folder
!ls -l /root/.keras/datasets/flower_photos/roses | grep jpg | wc -l

# Number of jpg images in the sunflowers folder
!ls -l /root/.keras/datasets/flower_photos/sunflowers | grep jpg | wc -l

# Number of jpg images in the tulips folder
!ls -l /root/.keras/datasets/flower_photos/tulips | grep jpg | wc -l

### 3. os.listdir과 PIL.Image 이용하여 이미지 읽기

# Folder of each flower class. The trailing slash matters: file names are
# appended to these strings with "+" further down.
daisy_path = '/root/.keras/datasets/flower_photos/daisy/'
dandelion_path = '/root/.keras/datasets/flower_photos/dandelion/'
roses_path = '/root/.keras/datasets/flower_photos/roses/'
sunflowers_path = '/root/.keras/datasets/flower_photos/sunflowers/'
tulips_path = '/root/.keras/datasets/flower_photos/tulips/'

# File names found in each class folder.
daisy_file = os.listdir(daisy_path)
dandelion_file = os.listdir(dandelion_path)
roses_file = os.listdir(roses_path)
sunflowers_file = os.listdir(sunflowers_path)
tulips_file = os.listdir(tulips_path)

# Look at the first two file names of two of the classes.
daisy_file[:2], roses_file[:2]

# Display the first two images of the daisy and roses folders, resized to
# 224x224, with the file name and class in the title.
preview_sources = (
    (daisy_path, daisy_file, ' : daisy'),
    (roses_path, roses_file, ' : roses'),
)
for folder_path, file_names, title_suffix in preview_sources:
    for file_name in file_names[:2]:
        picture = Image.open(folder_path + file_name).resize((224, 224))
        plt.title(file_name + title_suffix)
        plt.imshow(picture)
        plt.show()

### 4. 이미지 라벨링 포함해서 Class별 이미지 리스트 만들기

# Class labels: flower name -> integer id, and the inverse lookup derived from it.

class2idx = {
    name: idx
    for idx, name in enumerate(['daisy', 'dandelion', 'roses', 'sunflowers', 'tulips'])
}
idx2class = {idx: name for name, idx in class2idx.items()}

# Build the image list and the matching label list by hand, one class folder
# at a time.

img_list = []
label_list = []

# (folder, class name) pairs. The integer label is looked up in class2idx so
# the labels cannot drift from the class table (the copy-pasted per-class
# loops carried wrong "sunflowers : 2" / "tulips : 2" comments).
class_folders = [
    (daisy_path, 'daisy'),
    (dandelion_path, 'dandelion'),
    (roses_path, 'roses'),
    (sunflowers_path, 'sunflowers'),
    (tulips_path, 'tulips'),
]

for folder_path, class_name in class_folders:
    class_label = class2idx[class_name]
    # os.listdir returns names in arbitrary order; sorting keeps the dataset
    # order (and therefore the seeded train/test split) reproducible.
    for img_file in sorted(os.listdir(folder_path)):
        with Image.open(folder_path + img_file) as raw_img:
            # Force 3 channels so every array has the same (128, 128, 3) shape.
            img = raw_img.convert('RGB').resize((128, 128))
        # Scale the pixels to [0, 1]; float32 halves the memory of the
        # default float64 result.
        img_list.append(np.asarray(img, dtype=np.float32) / 255.)
        label_list.append(class_label)

# Convert the image list and the label list to numpy arrays.
img_list_arr = np.array (img_list)
label_list_arr = np.array(label_list)

# Check the shapes of both arrays.
img_list_arr.shape, label_list_arr.shape

### 5. Train/Test 데이터셋 만들기

from sklearn.model_selection import train_test_split

# Hold out 30% of the images for testing; stratify on the labels and fix the
# seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(img_list_arr, label_list_arr, test_size =0.3, stratify=label_list_arr, random_state=41)
X_train.shape, X_test.shape , y_train.shape, y_test.shape

## B. Build Model

### 1. Build Model

# Hyperparameters

num_epochs = 10
batch_size = 32

learning_rate = 0.001
# NOTE(review): dropout_rate is not referenced by the model definition that
# follows, which hard-codes 0.2 and 0.3 -- confirm whether it is still needed.
dropout_rate = 0.5

input_shape = (128, 128, 3) # must match the size the images were resized to

# CNN defined as a Sequential model, written as a single layer list:
# two Conv2D + MaxPooling2D stages, dropout, then a dense classifier with one
# softmax output per flower class (5).
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D

model = Sequential([
    Conv2D(32, kernel_size=(5, 5), strides=(1, 1), padding='same',
           activation='relu', input_shape=input_shape),
    MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
    Conv2D(64, (2, 2), activation='relu', padding='same'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(5, activation='softmax'),
])

# Integer class ids are used as targets, hence the sparse loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

### 2. Callback

# Callbacks: EarlyStopping and ModelCheckpoint

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# EarlyStopping: monitors val_loss (lower is better) with a patience of 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

# ModelCheckpoint: monitors val_loss and keeps only the best model on disk.
checkpoint_path = "my_checkpoint.keras"
cp = ModelCheckpoint(filepath=checkpoint_path,
 save_best_only =True,
 monitor = 'val_loss',
 verbose = 1)

### 3. 모델 학습

# Reminder of the values set earlier: num_epochs = 10, batch_size = 32

# Train the model.
# NOTE(review): the test split doubles as the validation set here, so early
# stopping and checkpoint selection see the test data -- confirm this is
# acceptable for the reported accuracy.
history = model.fit (
 X_train, y_train ,
 validation_data=(X_test, y_test),
 epochs=num_epochs,
 batch_size=batch_size,
 callbacks=[es, cp]
)

### 4. 성능 그래프

# Metrics recorded during training.
history.history.keys()

# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'], label='Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Model Accuracy')
plt.show()

### 5. Predict

# Predict the first 8 test images and show them in a 2x4 grid, with the true
# and the predicted class id in each title.

plt.figure(figsize=(16, 8))
for i, (img, label) in enumerate(zip(X_test[:8], y_test[:8]), start=1):
    # The model expects a batch dimension, hence the reshape to (1, 128, 128, 3).
    probabilities = model.predict(img.reshape(-1, 128, 128, 3))
    predicted_idx = np.argmax(probabilities)
    plt.subplot(2, 4, i)
    plt.title(f'True Value :{label}, Pred Value : {predicted_idx}')
    plt.imshow(img)
    plt.axis('off')

## 배운 내용 정리
1. os.listdir과 PIL.Image 활용해서 이미지 Dataset 만들어 보았습니다.
2. os.listdir과 PIL.Image 함수를 이용하여 필요한 데이터를 읽어오고
3. 이미지를 리스트에 하나씩 넣어주면서 라벨도 라벨 리스트에 함께 넣어 주었습니다.
4. 이미지 리스트와 라벨 리스트를 numpy array로 변경한 후 학습 데이터셋과 검증 데이터셋으로 나누어 주었습니다.
5. 이후 , CNN 모델을 만들고 학습하고 평가해 보았습니다.
6. 약간의 수작업이 있었는데, 이런 식으로 데이터셋을 구성하고 모델을 만들 수 있었습니다.

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

## C.image_dataset_from_directory 이용하여 데이터셋 만들기

### 1.필요한 라이브러리 임포트

from glob import glob
import os
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt

### 2.이미지 파일 가져오기

# Uses a dataset of roughly 3,700 flower photos.
# The download snippet below is used as-is.

import pathlib
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
# Fetch the archive (untar=True) and keep the returned location as a Path.
data_dir = tf.keras.utils.get_file('flower_photos', origin=dataset_url, untar=True)
data_dir = pathlib.Path(data_dir)

# Show the dataset path.
data_dir

# List the sub-folders of the image folder.
# (The lines starting with "!" are IPython shell escapes and only run inside a
# notebook.)

!ls -l /root/.keras/datasets/flower_photos/

# Number of jpg images in the daisy folder
!ls -l /root/.keras/datasets/flower_photos/daisy | grep jpg | wc -l

# Number of jpg images in the dandelion folder
!ls -l /root/.keras/datasets/flower_photos/dandelion | grep jpg | wc -l

# Number of jpg images in the roses folder
!ls -l /root/.keras/datasets/flower_photos/roses | grep jpg | wc -l

# Number of jpg images in the sunflowers folder
!ls -l /root/.keras/datasets/flower_photos/sunflowers | grep jpg | wc -l

# Number of jpg images in the tulips folder
!ls -l /root/.keras/datasets/flower_photos/tulips | grep jpg | wc -l

### 3. 이미지 파일 하나 읽어 이미지 보기

# Folder of the daisy images. The trailing slash matters: file names are
# appended to this string with "+" below.
daisy_path = '/root/.keras/datasets/flower_photos/daisy/'

# File names found in that folder.
daisy_file = os.listdir(daisy_path)

# Look at the first two file names.
daisy_file[:2]

# PIL.Image is not part of this notebook's import cell, so import it here;
# without it the loop below fails with a NameError.
from PIL import Image

# Display the first two daisy images, resized to 224x224, with the file name
# in the title.
for img_file in daisy_file[:2]:
    img = Image.open(daisy_path + img_file).resize((224, 224))
    plt.title(img_file + ' : daisy')
    plt.imshow(img)
    plt.show()

### 4. Data Preprocess

### image_dataset_from_directory 이용하여 자동으로 이미지 데이터셋 생성, 라벨링 한꺼번에 처리 할수 있다.

# Hyperparameters
input_shape = (224, 224, 3)
batch_size = 32
num_classes = 5  # was misspelled "num_calsses"

# Root folder that holds one sub-folder per flower class.
img_path = '/root/.keras/datasets/flower_photos/'

# image_dataset_from_directory labels (one-hot, label_mode='categorical'),
# batches and shuffles the images found under the class folders in one call.
# Both loaders share every option except the subset they return, so the
# common options are written once.
loader_options = dict(
    directory=img_path,
    label_mode='categorical',
    batch_size=batch_size,
    image_size=(224, 224),
    seed=42,
    shuffle=True,
    validation_split=0.2,
)

# 'training' subset of the split.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    subset='training', **loader_options
)

# 'validation' subset of the same split; used as the test set in this notebook.
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    subset='validation', **loader_options
)

# Class names of the training dataset.
train_ds.class_names

# len(ds) is multiplied by the batch size of 32 to estimate the image counts.
# NOTE(review): the original note here claimed 40,000 images (32,000 train /
# 8,000 test), which does not match the ~3,700-image dataset described
# earlier -- confirm the real numbers from the loader output.
len(train_ds) * 32 , len(test_ds) * 32

# Pull one batch and check the shapes of its images and labels.
batch_img, batch_label = next(iter(train_ds))
batch_img.shape, batch_label.shape

# Show one sample image from the first training batch.

for batch_img, batch_label in train_ds.take(1):
    sample_img = batch_img[0]
    print(sample_img.shape)
    # Pixel values are floats in the 0-255 range at this point, so divide by
    # 255 (the earlier 225 pushed bright pixels above 1.0) before plotting.
    plt.imshow(sample_img / 255)

## B. 모델링

### 1. Build Model

# Hyperparameters

num_epochs = 10
batch_size = 32

learning_rate = 0.001
# NOTE(review): dropout_rate is not referenced by the model definition that
# follows, which hard-codes 0.2 and 0.3 -- confirm whether it is still needed.
dropout_rate = 0.5

input_shape = (224, 224, 3) # must match the image_size used by the loaders
num_classes = 5

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Rescaling

model = Sequential()
# Declare the input shape on the first element of the model. It used to be
# passed to the Conv2D layer, which sits behind Rescaling and therefore is not
# the first layer of the model.
model.add(tf.keras.Input(shape=input_shape))
# Scale pixels from 0-255 to 0-1. Author's note: without this the model does
# not train well.
model.add(Rescaling(1. / 255))
model.add(Conv2D(32, kernel_size=(5, 5), strides=(1, 1), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Conv2D(64, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
# One softmax output per flower class.
model.add(Dense(num_classes, activation='softmax'))

# The datasets yield one-hot labels (label_mode='categorical'), hence the
# non-sparse loss.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

### 2. Callback

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# EarlyStopping: monitors val_loss (lower is better) with a patience of 3 epochs.
es = EarlyStopping(monitor ='val_loss', mode='min', verbose = 1, patience = 3)
# ModelCheckpoint: monitors val_loss and keeps only the best model on disk.
chk_path = 'my_checkpoint.keras'
cp = ModelCheckpoint(filepath = chk_path,
 save_best_only = True,
 monitor = 'val_loss',
 verbose = 1)

### 3. 모델 학습

# Training with datasets created by image_dataset_from_directory: the dataset
# objects are passed to fit() directly.
# Reminder of the value set earlier: num_epochs = 10

# Train the model.
# NOTE(review): test_ds doubles as the validation set here, so early stopping
# and checkpoint selection see the test data -- confirm this is acceptable.
history = model.fit(
 train_ds,
 validation_data=(test_ds),
 epochs=10,
 callbacks=[es, cp]
)

### 4. 성능 그래프

# Metrics recorded during training.
history.history.keys()

# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'], label='Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Model Accuracy')
plt.show()

### 5. Predict

# Number of test batches multiplied by the batch size of 32.
len(test_ds) * 32

# Pull one batch of images and labels from the test dataset.
batch_img , batch_label = next(iter(test_ds))
type(batch_img), batch_img.shape

# Predict every image of the fetched test batch and show the batch in an 8x4
# grid, with the true and the predicted class id in each title.

plt.figure(figsize=(16, 30))
for i, (img, label) in enumerate(zip(batch_img, batch_label), start=1):
    # The model expects a batch dimension, hence the reshape to (1, 224, 224, 3).
    probabilities = model.predict(img.numpy().reshape(-1, 224, 224, 3), verbose=0)
    predicted_idx = np.argmax(probabilities)
    plt.subplot(8, 4, i)
    plt.title(f'True Value:{np.argmax(label)}, Pred Value: {predicted_idx}')
    plt.axis('off')
    # Author's note: the pixels are floats, so they must be scaled to 0-1 for
    # imshow to display them without an error.
    plt.imshow(img / 255)

## 배운 내용 정리
1. DataSet을 만들기 위해 많은 수작업(파일 읽어오기, Pipeline(map, cache, batch, shuffle, prefetch), 라벨 코딩등) 필요
2. 이런 수작업을 케라스 image_dataset_from_directory 이용하여 한번에 처리할수 있습니다.
3. 여러분은 image_dataset_from_directory 와 같은 필요한 함수들을 필요에 따라 잘 사용하시면 되겠습니다.

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

## E. MobileNet Transfer Learning & Fine-tuning 모델링

### 1. Build Model

# List the models available in tf.keras.applications.
dir(tf.keras.applications)

# Create the base model from the pre-trained MobileNetV2 (ImageNet weights,
# without its classification head: include_top=False).
# This is the usual form for MobileNetV2 transfer learning.
base_model = tf.keras.applications.MobileNetV2(input_shape = (224, 224, 3), weights='imagenet', include_top= False)

base_model.summary()

# Author's notes: MobileNetV2 expects pixel values in [-1, 1], but at this
# point the image pixels are in [0, 255], so they should be rescaled for the
# model to work properly (it reportedly also performs well without it).
# Two ways to do that:
#   1) preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input
#   2) rescale = tf.keras.layers.Rescaling(1./127.5, offset=-1)

# Freeze the MobileNetV2 base model.
base_model.trainable = False

# Build the model with the Rescaling layer in front of the base model.
# Author's note: rescaling gives better accuracy.
inputs = tf.keras.Input(shape=(224,224,3))
x = tf.keras.layers.Rescaling(1./127.5, offset=-1)(inputs)
x = base_model(x, training=False)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
# One softmax output per flower class (5).
output = tf.keras.layers.Dense(5, activation='softmax')(x)

model = tf.keras.Model(inputs=inputs, outputs=output)
model.summary()

# Compile the model. learning_rate comes from the hyperparameter cell of the
# previous section.

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), # Optimization
 loss='categorical_crossentropy', # Loss Function
 metrics=['accuracy']) # Metrics / Accuracy

### 2. Callback

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# EarlyStopping: monitors val_loss (lower is better) with a patience of 3 epochs.
es =EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

# ModelCheckpoint: monitors val_loss and keeps only the best model on disk.
checkpoint_path = "my_checkpoint.keras"
checkpoint = ModelCheckpoint(filepath=checkpoint_path,
 save_best_only=True,
 monitor='val_loss',
 verbose=1)

### 3. 모델 학습

# The datasets were created with image_dataset_from_directory.
# Reminder of the values set earlier: num_epochs = 10, batch_size = 32
# NOTE(review): only 2 epochs are run here, which is below the early-stopping
# patience of 3, so early stopping presumably never triggers -- confirm the
# intended epoch count.

history = model.fit(
 train_ds,
 validation_data = test_ds,
 epochs=2,
 callbacks=[es, checkpoint]
)

### 4. 성능 그래프

# Metrics recorded during training.
history.history.keys()

# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'], label='Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Model Accuracy')
plt.show()

### 5. Predict

# Pull one sample batch from test_ds and check its shapes.
# Author's note: the batch size should be 32.

batch_img, batch_label = next(iter(test_ds))
print(batch_img.shape)
print(batch_label.shape)

# Look at the first 10 pixels of the first row of the first image.
# NOTE(review): the original comment said the images are already rescaled, but
# the Rescaling layer lives inside the model and the plotting code below still
# divides by 255, so these values are presumably in the raw 0-255 range --
# confirm from the output.
batch_img[0][0][:10]

# Predict every image of the fetched test batch and show the batch in an 8x4
# grid. Author's note: every prediction was correct in the original run.

plt.figure(figsize=(16, 30))
for i, (img, label) in enumerate(zip(batch_img, batch_label), start=1):
    # The model expects a batch dimension, hence the reshape to (1, 224, 224, 3).
    probabilities = model.predict(img.numpy().reshape(-1, 224, 224, 3), verbose=0)
    predicted_idx = np.argmax(probabilities)
    plt.subplot(8, 4, i)
    plt.title(f'True Value:{np.argmax(label)}, Pred Value: {predicted_idx}')
    # Author's note: the pixels are floats, so they must be scaled to 0-1 for
    # imshow to display them without an error.
    plt.imshow(img / 255)