You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

167 lines
6.3 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import numpy as np
from collections import defaultdict
def cel_index(freq):
#####聚类########
points = np.array(list(freq.keys()))
weights = np.array([freq[tuple(p)] for p in points])
# 初始化聚类中心:选择权重最大的两个点
top_indices = np.argsort(weights)[-2:]
centroids = points[top_indices].astype(float)
# K-means聚类
max_iters = 100
tolerance = 1e-6
prev_centroids = None
for _ in range(max_iters):
# 计算每个点到各中心的距离
dists_temp = np.minimum(np.abs(points[:, np.newaxis, :] - centroids),
360 - np.abs(points[:, np.newaxis, :] - centroids))
dists = np.sqrt((dists_temp ** 2).sum(axis=2))
# 分配点到最近的中心
cluster_labels = np.argmin(dists, axis=1)
# 更新中心位置
new_centroids = []
for i in range(2):
mask = (cluster_labels == i)
if np.any(mask):
weighted_sum = (points[mask].T * weights[mask]).sum(axis=1)
total_weight = weights[mask].sum()
new_center = weighted_sum / total_weight
new_centroids.append(new_center)
else:
new_centroids.append(centroids[i]) # 防止空簇
new_centroids = np.array(new_centroids)
# 检查收敛
if prev_centroids is not None and np.linalg.norm(new_centroids - prev_centroids) < tolerance:
break
centroids = new_centroids
prev_centroids = centroids.copy()
# 最终分配
dists_temp = np.minimum(np.abs(points[:, np.newaxis, :] - centroids),
360 - np.abs(points[:, np.newaxis, :] - centroids))
dists = np.sqrt((dists_temp ** 2).sum(axis=2))
cluster_labels = np.argmin(dists, axis=1)
# 三倍标准差过滤离群点并计算统计信息
result_list = [] # 存储最终12个值的列表
filtered_data = [] # 存储过滤后的点信息用于可视化
for cluster_idx in range(2):
mask = (cluster_labels == cluster_idx)
cluster_pts = points[mask]
cluster_wts = weights[mask]
# 直接使用原始数据计算中心
weighted_sum = (cluster_pts.T * cluster_wts).sum(axis=1)
total_weight = cluster_wts.sum()
center = weighted_sum / total_weight # 原始中心不再需要过滤后的center_filtered
# 计算统计信息(直接使用原始数据)
total_points = total_weight # 总权重即为有效点数
# 计算坐标绝对偏差(加权平均)
if total_weight > 0 and len(cluster_pts) > 0:
x_abs_diff = np.abs(cluster_pts[:, 0] - center[0])
avg_x_distance = np.sum(x_abs_diff * cluster_wts) / total_weight
y_abs_diff = np.abs(cluster_pts[:, 1] - center[1])
avg_y_distance = np.sum(y_abs_diff * cluster_wts) / total_weight
# 新增:计算最大/最小点距离中心的距离
# 找到横坐标最大和最小的点
max_x_point = cluster_pts[np.argmax(cluster_pts[:, 0])]
min_x_point = cluster_pts[np.argmin(cluster_pts[:, 0])]
max_y_point = cluster_pts[np.argmax(cluster_pts[:, 1])]
min_y_point = cluster_pts[np.argmin(cluster_pts[:, 1])]
# 计算这些点到中心的距离(欧氏距离)
max_x_dist = np.linalg.norm(max_x_point - center)
min_x_dist = np.linalg.norm(min_x_point - center)
max_y_dist = np.linalg.norm(max_y_point - center)
min_y_dist = np.linalg.norm(min_y_point - center)
else:
avg_x_distance = 0
avg_y_distance = 0
max_x_dist = min_x_dist = max_y_dist = min_y_dist = 0
result_list.extend([
center[0], # 中心横坐标(原始计算)
center[1], # 中心纵坐标(原始计算)
avg_x_distance, # 横坐标到中心的平均距离
avg_y_distance, # 纵坐标到中心的平均距离
max_x_dist, # 横坐标最大点到中心的距离
min_x_dist, # 横坐标最小点到中心的距离
max_y_dist, # 纵坐标最大点到中心的距离
min_y_dist # 纵坐标最小点到中心的距离
])
# 添加两个聚类中心的距离特征
if len(centroids) == 2:
# 计算横向距离x方向
dist_x = abs(centroids[0][0] - centroids[1][0])
# 计算纵向距离y方向
dist_y = abs(centroids[0][1] - centroids[1][1])
else:
# 如果聚类中心不足两个,添加默认值
result_list.extend([0, 0])
# 全局幅值统计
if len(points) > 0:
# 提取所有幅值(考虑权重)
amps = points[:, 1] # 幅值在points的第二列
weighted_amps = np.repeat(amps, weights) # 根据权重重复幅值
# 幅值偏斜度/陡峭度(使用无偏估计)
n = len(weighted_amps)
mean_amp = np.mean(weighted_amps)
std_amp = np.std(weighted_amps, ddof=1) # 样本标准差
if std_amp > 1e-8: # 避免除零
amp_skew = np.sum((weighted_amps - mean_amp) ** 3) / (n * std_amp ** 3)
amp_kurt = np.sum((weighted_amps - mean_amp) ** 4) / (n * std_amp ** 4) - 3
else:
amp_skew = amp_kurt = 0
else:
amp_skew = amp_kurt = 0
# 计算低幅值比例直接从points和weights计算
total_points = weights.sum() # 总非零点数(带权重)
if total_points > 0:
low_amp_mask = (points[:, 1] < 32) # 幅值小于32的点
low_amp_count = weights[low_amp_mask].sum()
low_amp_ratio = low_amp_count / total_points
else:
low_amp_ratio = 0
x_bins = 36
x_edges = np.linspace(0, 360, x_bins + 1)
# 统计当前簇在每个区间的权重总和
x_hist, _ = np.histogram(points[:, 0], bins=x_edges,
range=(0, 360))
variance = np.var(x_hist)
zero_count = np.sum(x_hist == 0)
total = len(x_hist)
a = zero_count / total
result_list.extend([
dist_x, # 聚类中心X距离
# dist_y, # 聚类中心Y距离
low_amp_ratio, # 原有低幅值比例
amp_skew, # 全局幅值偏斜度
amp_kurt, # 全局幅值陡峭度
a,
])
return result_list