bin/csvstat.py at main · DWay17/bin · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env python3
import csv
import argparse
from collections import Counter

def detect_separator(line):
    # Count occurrences of common delimiters in the line
    potential_separators = [",", ";", "|", "\t"]
    separator_counts = Counter({sep: line.count(sep) for sep in potential_separators})

    # Return the separator with the highest count
    detected_separator = separator_counts.most_common(1)[0][0] if separator_counts else ","
    return detected_separator

def analyze_csv(filename, separator=None, output_separator="\t"):
    print("==> " + filename + " <==")
    # Open the file and detect the separator if not provided
    with open(filename, "r") as file:
        first_line = file.readline().strip()
        if separator is None:
            separator = detect_separator(first_line)

    # Initialize variables
    col_names = []
    col_types = []
    empty_counts = []
    min_nums = []
    max_nums = []
    sum = []
    min_alphas = []
    max_alphas = []
    distinct_counts = []
    top_3_values = []

    # Re-open the file for processing
    with open(filename, "r") as file:
        reader = csv.reader(file, delimiter=separator)
        for row_index, row in enumerate(reader):
            if row_index == 0:
                # Header row
                col_names = row
                col_types = ["empty"] * len(col_names)
                empty_counts = [0] * len(col_names)
                filled_counts = [0] * len(col_names)
                num_counts = [0] * len(col_names)
                alpha_counts = [0] * len(col_names)
                alphanum_counts = [0] * len(col_names)
                min_nums = [None] * len(col_names)
                max_nums = [None] * len(col_names)
                sum = [0] * len(col_names)
                min_alphas = [None] * len(col_names)
                max_alphas = [None] * len(col_names)
                distinct_counts = [{} for _ in col_names]
                top_3_values = [[] for _ in col_names]
                continue

            # Analyze each cell
            for i, cell in enumerate(row):
                cell = cell.strip()  # Remove leading/trailing whitespace
                if cell == "":
                    empty_counts[i] += 1
                    cell_type = "empty"
                else:
                    filled_counts[i] += 1
                    # Detect cell type
                    if cell.replace(".", "", 1).isdigit() and cell.count(".") <= 1:  # Numeric
                        cell_type = "num"
                        num_counts[i] += 1
                        value = float(cell) if "." in cell else int(cell)  # Use int if no decimal
                        sum[i] += value
                        if min_nums[i] is None or value < min_nums[i]:
                            min_nums[i] = value
                        if max_nums[i] is None or value > max_nums[i]:
                            max_nums[i] = value
                    elif cell.isalpha():  # Alphabetic
                        alpha_counts[i] += 1
                        cell_type = "alpha"
                    else:  # Alphanumeric or mixed
                        alphanum_counts[i] += 1
                        cell_type = "alphanum"

                # Calculate min and max for alphas (length-based)
                length = len(cell)
                if min_alphas[i] is None or length < min_alphas[i]:
                    min_alphas[i] = length
                if max_alphas[i] is None or length > max_alphas[i]:
                    max_alphas[i] = length

                # Count distinct values
                if cell not in distinct_counts[i]:
                    distinct_counts[i][cell] = 0
                distinct_counts[i][cell] += 1

                # Evolve column type
                if col_types[i] == "empty":
                    col_types[i] = cell_type
                elif col_types[i] != "alphanum" and col_types[i] != cell_type:
                    col_types[i] = "alphanum"

    # Compute top 3 values for each column
    for i in range(len(col_names)):
        sorted_values = sorted(distinct_counts[i].items(), key=lambda x: (-x[1], x[0]))
        top_3_values[i] = sorted_values[:3]  # Includes value and count

    # Output the results
    # Print column names
    print(f"{'Column Names':<16}" + output_separator + output_separator.join(f"{col:<16}" for col in col_names))

    # Print column types
    print(f"{'Column Types':<16}" + output_separator + output_separator.join(f"{typ:<16}" for typ in col_types))

    # Print empty cell counts
    print(f"{'Empty Counts':<16}" + output_separator + output_separator.join(f"{count:<16}" for count in empty_counts))

    # Print filled cell counts
    print(f"{'Filled Counts':<16}" + output_separator + output_separator.join(f"{count:<16}" for count in filled_counts))


    # Print num cell counts
    print(f"{'num Counts':<16}" + output_separator + output_separator.join(f"{count:<16}" for count in num_counts))

    # Print alpha cell counts
    print(f"{'alpha Counts':<16}" + output_separator + output_separator.join(f"{count:<16}" for count in alpha_counts))

    # Print alphanum cell counts
    print(f"{'alphanum Counts':<16}" + output_separator + output_separator.join(f"{count:<16}" for count in alphanum_counts))

    # Print min values
    min_values = []
    for i, col_type in enumerate(col_types):
        if col_type == "num":
            min_values.append(str(min_nums[i]) if min_nums[i] is not None else "")
        else:  # For alpha and alphanum
            min_values.append(str(min_alphas[i]) if min_alphas[i] is not None else "")
    print(f"{'Min Values':<16}" + output_separator + output_separator.join(f"{val:<16}" for val in min_values))

    # Print max values
    max_values = []
    for i, col_type in enumerate(col_types):
        if col_type == "num":
            max_values.append(str(max_nums[i]) if max_nums[i] is not None else "")
        else:  # For alpha and alphanum
            max_values.append(str(max_alphas[i]) if max_alphas[i] is not None else "")
    print(f"{'Max Values':<16}" + output_separator + output_separator.join(f"{val:<16}" for val in max_values))

    # print average values
    # Calculate averages
    averages = []
    for i in range(len(col_names)):
        if num_counts[i] > 0:  # Only calculate average if there are numeric values
            averages.append(str(round(sum[i] / num_counts[i], 2)))  # Rounded to 2 decimal places
        else:
            averages.append("")  # No average if there are no numeric values
    print(f"{'Averages':<16}" + output_separator + output_separator.join(f"{val:<16}" for val in averages))


    # Print distinct value counts
    distinct_values_count = [str(len(distinct_counts[i])) for i in range(len(col_names))]
    print(f"{'Distinct Values':<16}" + output_separator + output_separator.join(f"{count:<16}" for count in distinct_values_count))

    # Print top 3 values with their counts
    for rank in range(3):  # Top 1, Top 2, Top 3
        row_label = f"Top{rank + 1}"
        row_values = []
        for i in range(len(col_names)):
            if rank < len(top_3_values[i]):
                value, count = top_3_values[i][rank]
                row_values.append(f"{value} : {count}")
            else:
                row_values.append("")
        print(f"{row_label:<16}" + output_separator + output_separator.join(f"{val:<16}" for val in row_values))

if __name__ == "__main__":
    # Set up argument parser
    parser = argparse.ArgumentParser(description="Analyze a CSV file.")
    parser.add_argument("filename", help="Path to the input CSV file")
    parser.add_argument(
        "--separator", "-s", help="Column separator (auto-detects if not provided)"
    )
    parser.add_argument(
        "--output_separator",
        "-o",
        default="\t",
        help="Output separator (default: tab)",
    )

    # Parse command-line arguments
    args = parser.parse_args()

    # Call the function with parsed arguments
    analyze_csv(args.filename, args.separator, args.output_separator)