Upload feature_extraction_moment.py

feature_extraction_moment.py  +126 -19  CHANGED
@@ -2,7 +2,7 @@
 # - Convert time-series data (DataFrame, numpy array, or torch tensor) into a torch tensor
 # - input validation
 
-from typing import List, Optional, Union
+from typing import List, Optional, Union, Literal, Tuple
 
 from pandas import DataFrame
 import numpy as np
@@ -29,63 +29,170 @@ class MomentFeatureExtractor(FeatureExtractionMixin):
         super().__init__(**kwargs)
 
 
+    """
+    padding (bool, str or PaddingStrategy, optional, defaults to False):
+        Activates and controls padding. Accepts the following values:
+        - True or 'longest': pad to the longest sequence in the batch (no padding if only a single sequence is provided).
+        - 'max_length': pad to the length given by the max_length argument, or to the maximum input length accepted by the model if max_length is not provided.
+        - False or 'do_not_pad' (default): no padding (i.e. the batch may contain sequences of different lengths).
+    """
     def __call__(
         self,
         time_series: Union[DataFrame, np.ndarray, torch.Tensor, List[DataFrame], List[np.ndarray], List[torch.Tensor]] = None,
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
         torch_dtype: Optional[Union[str, torch.dtype]] = torch.float,
+        padding: Union[bool, str] = False,  # added
+        max_length: int = None,  # added
     ) -> BatchFeature:
         if time_series is not None:
-            time_series_values = self._convert_time_series(time_series, return_tensors, torch_dtype)
+            time_series_values, input_mask = self._convert_time_series(time_series, return_tensors, torch_dtype, padding, max_length)
         else:
             time_series_values = None
+            input_mask = None
 
-        return BatchFeature(data={"time_series_values": time_series_values})
+        return BatchFeature(data={"time_series_values": time_series_values, "input_mask": input_mask})
 
 
-    def _convert_time_series(self, time_series, return_tensors, torch_dtype):
+    def _convert_time_series(self, time_series, return_tensors, torch_dtype, padding, max_length):
         # Convert a DataFrame, np.ndarray, or torch.Tensor to a torch.Tensor
         if isinstance(time_series, list):
             # Convert each element of the list to a torch.Tensor, then combine them into a single tensor
-
+            time_series_list = [self._convert_to_tensor(ts, torch_dtype) for ts in time_series]
+            # Check the number of dimensions
+            time_series_list = [self._convert_tensor_dim(ts, dim=2) for ts in time_series_list]
+            # Truncate / pad
+            time_series_tensor, input_mask = self._pad_time_series(time_series_list, padding, max_length)
         else:
             time_series_tensor = self._convert_to_tensor(time_series, torch_dtype)
-
-
-
-
-        elif time_series_tensor.dim() == 2:
-            time_series_tensor = time_series_tensor.unsqueeze(0)
-        elif time_series_tensor.dim() == 1:
-            time_series_tensor = time_series_tensor.unsqueeze(0).unsqueeze(0)
+            # Check the number of dimensions
+            time_series_tensor = self._convert_tensor_dim(time_series_tensor, dim=3)
+            # Truncate / pad
+            time_series_tensor, input_mask = self._pad_time_series(time_series_tensor, padding, max_length)
 
         # Log the shape of the input
         batch_size, n_channels, d_model = time_series_tensor.shape
         logger.info(f"Batch size: {batch_size}, Number of channels: {n_channels}, Dimension of model: {d_model}")
 
-        # seq_len up to a maximum of 512
+        # Truncate seq_len to a maximum of 512
         if time_series_tensor.shape[2] > 512:
             time_series_tensor = time_series_tensor[:, :, :512]
             logger.info("Sequence length has been truncated to 512.")
 
         # Convert the data format according to the return_tensors setting
         if return_tensors == 'pt' or return_tensors == TensorType.PYTORCH:
-            return time_series_tensor
+            return time_series_tensor, input_mask
         elif return_tensors == 'np' or return_tensors == TensorType.NUMPY:
-            return time_series_tensor.numpy()
+            return time_series_tensor.numpy(), input_mask
         elif return_tensors == 'tf' or return_tensors == TensorType.TENSORFLOW:
-            return tf.convert_to_tensor(time_series_tensor.numpy())
+            return tf.convert_to_tensor(time_series_tensor.numpy()), input_mask
         elif return_tensors == 'jax' or return_tensors == TensorType.JAX:
-            return jnp.array(time_series_tensor.numpy())
+            return jnp.array(time_series_tensor.numpy()), input_mask
         else:
             raise ValueError("Unsupported return_tensors type")
 
     def _convert_to_tensor(self, time_series, torch_dtype):
         if isinstance(time_series, DataFrame):
             time_series_tensor = torch.tensor(time_series.values, dtype=torch_dtype).t()
-        elif isinstance(time_series, np.ndarray):
+        elif isinstance(time_series, np.ndarray) or isinstance(time_series, list):
             time_series_tensor = torch.tensor(time_series, dtype=torch_dtype)
         elif isinstance(time_series, torch.Tensor):
             time_series_tensor = time_series.to(torch_dtype)
 
         return time_series_tensor
+
+    def _convert_tensor_dim(self, time_series, dim=3):
+        if time_series.dim() > dim:
+            raise ValueError("time_series must not have more than 3 dimensions")
+
+        while time_series.dim() < dim:
+            time_series = time_series.unsqueeze(0)
+
+        return time_series
+
+
+    def _pad_time_series(self,
+        time_series_values: Union[torch.Tensor, List[torch.Tensor]],
+        padding: Union[bool, Literal['longest', 'max_length', 'do_not_pad']] = 'do_not_pad',
+        max_length: Union[int, None] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Apply padding to time-series data and generate the corresponding input_mask.
+
+        Args:
+            time_series_values (Union[torch.Tensor, List[torch.Tensor]]):
+                The time-series data to pad.
+                Expected to be a 3-D tensor (batch_size, n_channels, seq_len) or
+                a list of 2-D tensors (n_channels, seq_len).
+            padding (Union[bool, Literal['longest', 'max_length', 'do_not_pad']], optional):
+                The padding strategy. Defaults to 'do_not_pad'.
+                - True or 'longest': pad to the longest sequence in the batch
+                - 'max_length': pad to the specified maximum length
+                - False or 'do_not_pad': no padding (truncate to the shortest sequence)
+            max_length (Union[int, None], optional):
+                Maximum length used with 'max_length' padding.
+                If not given, 512 is used. Defaults to None.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]:
+                - The padded time-series data, of shape (batch_size, n_channels, padded_seq_len).
+                - The input_mask, of shape (batch_size, padded_seq_len).
+                  1 marks real data and 0 marks padded positions.
+
+        Raises:
+            ValueError: For an unsupported input shape, an invalid padding option,
+                an invalid max_length, or a mismatch in the number of channels.
+        """
+        # Validate max_length
+        if max_length is not None:
+            if not isinstance(max_length, int) or max_length <= 0:
+                raise ValueError("max_length must be a positive integer.")
+
+        if isinstance(time_series_values, list):
+            if not all(isinstance(ts, torch.Tensor) and ts.dim() == 2 for ts in time_series_values):
+                raise ValueError("Each element of the list must be a 2-dimensional torch.Tensor.")
+
+            batch_size = len(time_series_values)
+            n_channels = time_series_values[0].shape[0]
+            seq_lens = [ts.shape[1] for ts in time_series_values]
+
+            # Check that all series have the same number of channels
+            if not all(ts.shape[0] == n_channels for ts in time_series_values):
+                raise ValueError("All time series must have the same number of channels.")
+
+        elif isinstance(time_series_values, torch.Tensor):
+            if time_series_values.dim() == 3:
+                batch_size, n_channels, seq_len = time_series_values.shape
+                seq_lens = [seq_len] * batch_size
+                time_series_values = [time_series_values[i] for i in range(batch_size)]
+            elif time_series_values.dim() == 2:
+                n_channels, seq_len = time_series_values.shape
+                batch_size = 1
+                seq_lens = [seq_len]
+                time_series_values = [time_series_values]
+            else:
+                raise ValueError("The tensor must be 2- or 3-dimensional.")
+        else:
+            raise ValueError("The input must be a torch.Tensor or a list of torch.Tensor.")
+
+        if padding == True or padding == 'longest':
+            target_len = max(seq_lens)
+        elif padding == 'max_length':
+            target_len = max_length if max_length is not None else 512
+        elif padding == False or padding == 'do_not_pad':
+            target_len = min(seq_lens)
+        else:
+            raise ValueError("Invalid padding option.")
+
+        # Ensure all tensors end up on the same device
+        device = time_series_values[0].device
+
+        padded_values = torch.zeros((batch_size, n_channels, target_len), dtype=time_series_values[0].dtype, device=device)
+        input_mask = torch.zeros((batch_size, target_len), dtype=torch.bool, device=device)
+
+        for i in range(batch_size):
+            seq = time_series_values[i]
+            length = min(seq.shape[1], target_len)
+            padded_values[i, :, :length] = seq[:, :length]
+            input_mask[i, :length] = True
+
+        return padded_values, input_mask
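
The padding options documented in the __call__ docstring can be exercised end to end. The sketch below is illustrative only and is not part of the commit: it assumes the file above is importable as feature_extraction_moment, that MomentFeatureExtractor() takes no required constructor arguments (its __init__ is not fully shown in this diff), and that torch and transformers are installed.

# Usage sketch (illustration only): batching two series of different lengths with padding="longest".
import torch
from feature_extraction_moment import MomentFeatureExtractor

extractor = MomentFeatureExtractor()

# Two series with the same number of channels (2) but different lengths (100 and 80).
series = [torch.randn(2, 100), torch.randn(2, 80)]

features = extractor(series, padding="longest", return_tensors="pt")
print(features["time_series_values"].shape)   # torch.Size([2, 2, 100])
print(features["input_mask"].shape)           # torch.Size([2, 100])
print(features["input_mask"][1, 80:].any())   # tensor(False): positions past 80 are padding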
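
The target lengths and the 1/0 mask described in the _pad_time_series docstring can also be checked directly on toy tensors. A minimal sketch under the same import assumptions, calling the helper on an instance purely for illustration:

import torch
from feature_extraction_moment import MomentFeatureExtractor

extractor = MomentFeatureExtractor()
a = torch.ones(1, 4)   # 1 channel, length 4
b = torch.ones(1, 2)   # 1 channel, length 2

# 'max_length' pads shorter series and truncates longer ones to the given length.
values, mask = extractor._pad_time_series([a, b], padding="max_length", max_length=3)
print(values.shape)    # torch.Size([2, 1, 3])
print(mask.int())      # tensor([[1, 1, 1],
                       #         [1, 1, 0]])

# 'do_not_pad' (the default) truncates everything to the shortest series instead.
values, mask = extractor._pad_time_series([a, b], padding="do_not_pad")
print(values.shape)    # torch.Size([2, 1, 2])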
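
For a single DataFrame, _convert_to_tensor transposes the (seq_len, n_channels) layout, _convert_tensor_dim adds the batch dimension, and _convert_time_series cuts sequences longer than 512 down to 512. Another hedged sketch under the same assumptions:

import numpy as np
import pandas as pd
from feature_extraction_moment import MomentFeatureExtractor

extractor = MomentFeatureExtractor()

# 600 time steps, 3 channels; DataFrames are expected as (seq_len, n_channels).
df = pd.DataFrame(np.random.randn(600, 3))

features = extractor(df, return_tensors="np")
print(features["time_series_values"].shape)   # (1, 3, 512) after truncation to 512
print(features["input_mask"].shape)           # torch.Size([1, 600]): the mask keeps the
                                              # pre-truncation length and stays a torch tensor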