-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathreports_dynamic.js
More file actions
220 lines (195 loc) · 6.18 KB
/
reports_dynamic.js
File metadata and controls
220 lines (195 loc) · 6.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
/**
* Dynamic Reports Generator
*
* This file automatically generates Dataform operations for HTTP Archive reports.
* It creates operations for each combination of:
* - Date range (from startDate to endDate)
* - Metrics (defined in includes/reports.js)
* - SQL types (histogram, timeseries)
* - Lenses (data filters like all, top1k, wordpress, etc.)
*
* Each operation:
* 1. Calculates metrics from crawl data
* 2. Stores results in BigQuery tables
* 3. Exports data to Cloud Storage as JSON
*/
// Initialize configurations
const httpArchiveReports = new reports.HTTPArchiveReports()
const availableMetrics = httpArchiveReports.listMetrics()
const availableLenses = httpArchiveReports.lenses
// Configuration constants
const EXPORT_CONFIG = {
bucket: constants.bucket,
storagePath: constants.storagePath,
dataset: 'reports',
fileFormat: '.json'
}
// Date range for report generation
// Adjust these dates to update reports retrospectively
const DATE_RANGE = {
startDate: constants.currentMonth, // '2025-07-01'
endDate: constants.currentMonth // '2025-07-01'
}
/**
* Generates the Cloud Storage export path for a report
* @param {Object} reportConfig - Report configuration object
* @returns {string} - Cloud Storage object path
*/
function buildExportPath(reportConfig) {
const { sql, date, metric } = reportConfig
let objectPath = EXPORT_CONFIG.storagePath
if (sql.type === 'histogram') {
// Histogram exports are organized by date folders
const dateFolder = date.replaceAll('-', '_')
objectPath += `${dateFolder}/${metric.id}`
} else if (sql.type === 'timeseries') {
// Timeseries exports are organized by metric
objectPath += metric.id
} else {
throw new Error(`Unknown SQL type: ${sql.type}`)
}
return objectPath + EXPORT_CONFIG.fileFormat
}
/**
* Generates the BigQuery export query for a report
* @param {Object} reportConfig - Report configuration object
* @returns {string} - SQL query for exporting data
*/
function buildExportQuery(reportConfig) {
const { sql, date, metric, lens, tableName } = reportConfig
let query
if (sql.type === 'histogram') {
query = `
SELECT
* EXCEPT(date, metric, lens)
FROM \`${EXPORT_CONFIG.dataset}.${tableName}\`
WHERE date = '${date}'
AND metric = '${metric.id}'
AND lens = '${lens.name}'
ORDER BY client, bin ASC
`
} else if (sql.type === 'timeseries') {
query = `
SELECT
UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
* EXCEPT(date, metric, lens)
FROM \`${EXPORT_CONFIG.dataset}.${tableName}\`
WHERE
date = '${date}'
AND metric = '${metric.id}'
AND lens = '${lens.name}'
ORDER BY date, client DESC
`
} else {
throw new Error(`Unknown SQL type: ${sql.type}`)
}
// Convert to single line for JSON embedding
return query.replace(/[\r\n]+/g, ' ').trim()
}
/**
* Creates a report configuration object
* @param {string} date - Report date (YYYY-MM-DD)
* @param {Object} metric - Metric configuration
* @param {Object} sql - SQL configuration (type and query)
* @param {string} lensName - Lens name
* @param {string} lensSQL - Lens SQL filter
* @returns {Object} - Complete report configuration
*/
function createReportConfig(date, metric, sql, lensName, lensSQL) {
return {
date,
metric,
sql,
lens: { name: lensName, sql: lensSQL },
devRankFilter: constants.devRankFilter,
tableName: sql.type === 'timeseries' ? sql.type : `${metric.id}_${sql.type}`
}
}
/**
* Generates all report configurations for the specified date range
* @returns {Array} - Array of report configuration objects
*/
function generateReportConfigurations() {
const reportConfigs = []
// Generate configurations for each date in range
for (let date = DATE_RANGE.endDate;
date >= DATE_RANGE.startDate;
date = constants.fnPastMonth(date)) {
// For each available metric
availableMetrics.forEach(metric => {
// For each SQL type (histogram, timeseries)
metric.SQL.forEach(sql => {
// For each available lens (all, top1k, wordpress, etc.)
Object.entries(availableLenses).forEach(([lensName, lensSQL]) => {
const config = createReportConfig(date, metric, sql, lensName, lensSQL)
reportConfigs.push(config)
})
})
})
}
return reportConfigs
}
/**
* Creates a Dataform operation name for a report configuration
* @param {Object} reportConfig - Report configuration object
* @returns {string} - Operation name
*/
function createOperationName(reportConfig) {
const { tableName, date, lens } = reportConfig
return `${tableName}_${date}_${lens.name}`
}
/**
* Generates the SQL for a Dataform operation
* @param {Object} ctx - Dataform context
* @param {Object} reportConfig - Report configuration object
* @returns {string} - Complete SQL for the operation
*/
function generateOperationSQL(ctx, reportConfig) {
const { date, metric, lens, sql, tableName } = reportConfig
return `
DECLARE job_config JSON;
/* First report run - uncomment to create table
CREATE TABLE IF NOT EXISTS ${EXPORT_CONFIG.dataset}.${tableName}
PARTITION BY date
CLUSTER BY metric, lens, client
AS
*/
--/* Subsequent report run
DELETE FROM ${EXPORT_CONFIG.dataset}.${tableName}
WHERE date = '${date}'
AND metric = '${metric.id}'
AND lens = '${lens.name}';
INSERT INTO ${EXPORT_CONFIG.dataset}.${tableName}
--*/
SELECT
DATE('${date}') AS date,
'${metric.id}' AS metric,
'${lens.name}' AS lens,
*
FROM (
${sql.query(ctx, reportConfig)}
);
SET job_config = TO_JSON(
STRUCT(
"cloud_storage" AS destination,
STRUCT(
"httparchive" AS bucket,
"${buildExportPath(reportConfig)}" AS name
) AS config,
r"${buildExportQuery(reportConfig)}" AS query
)
);
SELECT reports.run_export_job(job_config);
`
}
// Generate all report configurations
const reportConfigurations = generateReportConfigurations()
// Create Dataform operations for each report configuration
reportConfigurations.forEach(reportConfig => {
const operationName = createOperationName(reportConfig)
operate(operationName, {
disabled: true,
})
.tags(['crawl_complete', 'crawl_reports'])
.queries(ctx => generateOperationSQL(ctx, reportConfig))
})