You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

167 lines
6.3 KiB
Python

2 weeks ago
import numpy as np
from collections import defaultdict
def cel_index(freq):
#####聚类########
points = np.array(list(freq.keys()))
weights = np.array([freq[tuple(p)] for p in points])
# 初始化聚类中心:选择权重最大的两个点
top_indices = np.argsort(weights)[-2:]
centroids = points[top_indices].astype(float)
# K-means聚类
max_iters = 100
tolerance = 1e-6
prev_centroids = None
for _ in range(max_iters):
# 计算每个点到各中心的距离
dists_temp = np.minimum(np.abs(points[:, np.newaxis, :] - centroids),
360 - np.abs(points[:, np.newaxis, :] - centroids))
dists = np.sqrt((dists_temp ** 2).sum(axis=2))
# 分配点到最近的中心
cluster_labels = np.argmin(dists, axis=1)
# 更新中心位置
new_centroids = []
for i in range(2):
mask = (cluster_labels == i)
if np.any(mask):
weighted_sum = (points[mask].T * weights[mask]).sum(axis=1)
total_weight = weights[mask].sum()
new_center = weighted_sum / total_weight
new_centroids.append(new_center)
else:
new_centroids.append(centroids[i]) # 防止空簇
new_centroids = np.array(new_centroids)
# 检查收敛
if prev_centroids is not None and np.linalg.norm(new_centroids - prev_centroids) < tolerance:
break
centroids = new_centroids
prev_centroids = centroids.copy()
# 最终分配
dists_temp = np.minimum(np.abs(points[:, np.newaxis, :] - centroids),
360 - np.abs(points[:, np.newaxis, :] - centroids))
dists = np.sqrt((dists_temp ** 2).sum(axis=2))
cluster_labels = np.argmin(dists, axis=1)
# 三倍标准差过滤离群点并计算统计信息
result_list = [] # 存储最终12个值的列表
filtered_data = [] # 存储过滤后的点信息用于可视化
for cluster_idx in range(2):
mask = (cluster_labels == cluster_idx)
cluster_pts = points[mask]
cluster_wts = weights[mask]
# 直接使用原始数据计算中心
weighted_sum = (cluster_pts.T * cluster_wts).sum(axis=1)
total_weight = cluster_wts.sum()
center = weighted_sum / total_weight # 原始中心不再需要过滤后的center_filtered
# 计算统计信息(直接使用原始数据)
total_points = total_weight # 总权重即为有效点数
# 计算坐标绝对偏差(加权平均)
if total_weight > 0 and len(cluster_pts) > 0:
x_abs_diff = np.abs(cluster_pts[:, 0] - center[0])
avg_x_distance = np.sum(x_abs_diff * cluster_wts) / total_weight
y_abs_diff = np.abs(cluster_pts[:, 1] - center[1])
avg_y_distance = np.sum(y_abs_diff * cluster_wts) / total_weight
# 新增:计算最大/最小点距离中心的距离
# 找到横坐标最大和最小的点
max_x_point = cluster_pts[np.argmax(cluster_pts[:, 0])]
min_x_point = cluster_pts[np.argmin(cluster_pts[:, 0])]
max_y_point = cluster_pts[np.argmax(cluster_pts[:, 1])]
min_y_point = cluster_pts[np.argmin(cluster_pts[:, 1])]
# 计算这些点到中心的距离(欧氏距离)
max_x_dist = np.linalg.norm(max_x_point - center)
min_x_dist = np.linalg.norm(min_x_point - center)
max_y_dist = np.linalg.norm(max_y_point - center)
min_y_dist = np.linalg.norm(min_y_point - center)
else:
avg_x_distance = 0
avg_y_distance = 0
max_x_dist = min_x_dist = max_y_dist = min_y_dist = 0
result_list.extend([
center[0], # 中心横坐标(原始计算)
center[1], # 中心纵坐标(原始计算)
avg_x_distance, # 横坐标到中心的平均距离
avg_y_distance, # 纵坐标到中心的平均距离
max_x_dist, # 横坐标最大点到中心的距离
min_x_dist, # 横坐标最小点到中心的距离
max_y_dist, # 纵坐标最大点到中心的距离
min_y_dist # 纵坐标最小点到中心的距离
])
# 添加两个聚类中心的距离特征
if len(centroids) == 2:
# 计算横向距离x方向
dist_x = abs(centroids[0][0] - centroids[1][0])
# 计算纵向距离y方向
dist_y = abs(centroids[0][1] - centroids[1][1])
else:
# 如果聚类中心不足两个,添加默认值
result_list.extend([0, 0])
# 全局幅值统计
if len(points) > 0:
# 提取所有幅值(考虑权重)
amps = points[:, 1] # 幅值在points的第二列
weighted_amps = np.repeat(amps, weights) # 根据权重重复幅值
# 幅值偏斜度/陡峭度(使用无偏估计)
n = len(weighted_amps)
mean_amp = np.mean(weighted_amps)
std_amp = np.std(weighted_amps, ddof=1) # 样本标准差
if std_amp > 1e-8: # 避免除零
amp_skew = np.sum((weighted_amps - mean_amp) ** 3) / (n * std_amp ** 3)
amp_kurt = np.sum((weighted_amps - mean_amp) ** 4) / (n * std_amp ** 4) - 3
else:
amp_skew = amp_kurt = 0
else:
amp_skew = amp_kurt = 0
# 计算低幅值比例直接从points和weights计算
total_points = weights.sum() # 总非零点数(带权重)
if total_points > 0:
low_amp_mask = (points[:, 1] < 32) # 幅值小于32的点
low_amp_count = weights[low_amp_mask].sum()
low_amp_ratio = low_amp_count / total_points
else:
low_amp_ratio = 0
x_bins = 36
x_edges = np.linspace(0, 360, x_bins + 1)
# 统计当前簇在每个区间的权重总和
x_hist, _ = np.histogram(points[:, 0], bins=x_edges,
range=(0, 360))
variance = np.var(x_hist)
zero_count = np.sum(x_hist == 0)
total = len(x_hist)
a = zero_count / total
result_list.extend([
dist_x, # 聚类中心X距离
# dist_y, # 聚类中心Y距离
low_amp_ratio, # 原有低幅值比例
amp_skew, # 全局幅值偏斜度
amp_kurt, # 全局幅值陡峭度
a,
])
return result_list