1 /*
2  * Copyright (C) 2019 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package com.android.tradefed.device.metric;
17 
18 import static com.android.tradefed.util.proto.TfMetricProtoUtil.stringToMetric;
19 
20 import com.android.os.AtomsProto.Atom;
21 import com.android.os.StatsLog.EventMetricData;
22 import com.android.os.StatsLog.StatsdStatsReport;
23 import com.android.tradefed.config.OptionClass;
24 import com.android.tradefed.device.DeviceNotAvailableException;
25 import com.android.tradefed.device.ITestDevice;
26 import com.android.tradefed.log.LogUtil.CLog;
27 import com.android.tradefed.metrics.proto.MetricMeasurement.Metric;
28 import com.android.tradefed.util.statsd.ConfigUtil;
29 import com.android.tradefed.util.statsd.MetricUtil;
30 
31 import com.google.common.annotations.VisibleForTesting;
32 import com.google.common.collect.Iterables;
33 import com.google.protobuf.InvalidProtocolBufferException;
34 
35 import java.io.IOException;
36 import java.text.SimpleDateFormat;
37 import java.util.Arrays;
38 import java.util.ArrayList;
39 import java.util.Date;
40 import java.util.HashMap;
41 import java.util.List;
42 import java.util.Map;
43 import java.util.NoSuchElementException;
44 import java.util.concurrent.TimeUnit;
45 import java.util.stream.Collectors;
46 
47 /**
48  * Collector that collects timestamps of runtime restarts (system server crashes) during the test
49  * run, if any.
50  *
51  * <p>Outputs results in counts, wall clock time in seconds and in HH:mm:ss format, and system
52  * uptime in nanoseconds and HH:mm:ss format.
53  *
54  * <p>This collector uses two sources for system server crashes:
55  *
56  * <p>
57  *
58  * <ol>
59  *   <li>The system_restart_sec list from StatsdStatsReport, which is a rolling list of 20
60  *       timestamps when the system server crashes, in seconds, with newer crashes appended to the
61  *       end (when the list fills up, older timestamps fall off the beginning).
62  *   <li>The AppCrashOccurred statsd atom, where a system server crash shows up as a system_server
63  *       process crash (this behavior is documented in the statsd atoms.proto definition). The event
64  *       metric gives the device uptime when the crash occurs.
65  * </ol>
66  *
67  * <p>Both can be useful information, as the former makes it easy to correlate timestamps in logs,
68  * while the latter serves as a longevity metric.
69  */
70 @OptionClass(alias = "runtime-restart-collector")
71 public class RuntimeRestartCollector extends BaseDeviceMetricCollector {
72     private static final String METRIC_SEP = "-";
73     public static final String METRIC_PREFIX = "runtime-restart";
74     public static final String METRIC_SUFFIX_COUNT = "count";
75     public static final String METRIC_SUFFIX_SYSTEM_TIMESTAMP_SECS = "timestamps_secs";
76     public static final String METRIC_SUFFIX_SYSTEM_TIMESTAMP_FORMATTED = "timestamps_str";
77     public static final String METRIC_SUFFIX_UPTIME_NANOS = "uptime_nanos";
78     public static final String METRIC_SUFFIX_UPTIME_FORMATTED = "uptime_str";
79     public static final SimpleDateFormat TIME_FORMATTER = new SimpleDateFormat("HH:mm:ss");
80     public static final String SYSTEM_SERVER_KEYWORD = "system_server";
81 
82     private List<ITestDevice> mTestDevices;
83     // Map to store the runtime restart timestamps for each device, keyed by device serial numbers.
84     private Map<String, List<Integer>> mExistingTimestamps = new HashMap<>();
85     // Map to store the statsd config IDs for each device, keyed by device serial numbers.
86     private Map<String, Long> mDeviceConfigIds = new HashMap<>();
87     // Flag variable to indicate whether including the device serial in the metrics is needed.
88     // Device serial is included when there are more than one device under test.
89     private boolean mIncludeDeviceSerial = false;
90 
91     /**
92      * Store the existing timestamps of system server restarts prior to the test run as statsd keeps
93      * a running log of them, and push the config to collect app crashes.
94      */
95     @Override
onTestRunStart(DeviceMetricData runData)96     public void onTestRunStart(DeviceMetricData runData) {
97         mTestDevices = getDevices();
98         mIncludeDeviceSerial = (mTestDevices.size() > 1);
99         for (ITestDevice device : mTestDevices) {
100             // Pull statsd metadata (StatsdStatsReport) from the device and keep track of existing
101             // runtime restart timestamps.
102             try {
103                 List<Integer> existingTimestamps =
104                         getStatsdMetadata(device).getSystemRestartSecList();
105                 mExistingTimestamps.put(device.getSerialNumber(), existingTimestamps);
106             } catch (DeviceNotAvailableException | InvalidProtocolBufferException e) {
107                 // Error is not thrown as we still want to process the other devices.
108                 CLog.e(
109                         "Failed to get statsd metadata from device %s. Exception: %s.",
110                         device.getSerialNumber(), e);
111             }
112 
113             // Register statsd config to collect app crashes.
114             try {
115                 mDeviceConfigIds.put(
116                         device.getSerialNumber(),
117                         pushStatsConfig(
118                                 device, Arrays.asList(Atom.APP_CRASH_OCCURRED_FIELD_NUMBER)));
119             } catch (DeviceNotAvailableException | IOException e) {
120                 // Error is not thrown as we still want to push the config to other devices.
121                 CLog.e(
122                         "Failed to push statsd config to device %s. Exception: %s.",
123                         device.getSerialNumber(), e);
124             }
125         }
126     }
127 
128     /**
129      * Pull the timestamps at the end of the test run and report the difference with existing ones,
130      * if any.
131      */
132     @Override
onTestRunEnd( DeviceMetricData runData, final Map<String, Metric> currentRunMetrics)133     public void onTestRunEnd(
134             DeviceMetricData runData, final Map<String, Metric> currentRunMetrics) {
135         for (ITestDevice device : mTestDevices) {
136             // Pull statsd metadata again to look at the changes of system server crash timestamps
137             // during the test run, and report them.
138             // A copy of the list is created as the list returned by the proto is not modifiable.
139             List<Integer> updatedTimestamps = new ArrayList<>();
140             try {
141                 updatedTimestamps.addAll(getStatsdMetadata(device).getSystemRestartSecList());
142             } catch (DeviceNotAvailableException | InvalidProtocolBufferException e) {
143                 // Error is not thrown as we still want to process the other devices.
144                 CLog.e(
145                         "Failed to get statsd metadata from device %s. Exception: %s.",
146                         device.getSerialNumber(), e);
147                 continue;
148             }
149             if (!mExistingTimestamps.containsKey(device.getSerialNumber())) {
150                 CLog.e("No prior state recorded for device %s.", device.getSerialNumber());
151                 addStatsdStatsBasedMetrics(
152                         currentRunMetrics, updatedTimestamps, device.getSerialNumber());
153             } else {
154                 try {
155                     int lastTimestampBeforeTestRun =
156                             Iterables.getLast(mExistingTimestamps.get(device.getSerialNumber()));
157                     // If the last timestamp is not found, lastIndexOf(...) + 1 returns 0 which is
158                     // what is needed in that situation.
159                     addStatsdStatsBasedMetrics(
160                             currentRunMetrics,
161                             getAllValuesAfter(lastTimestampBeforeTestRun, updatedTimestamps),
162                             device.getSerialNumber());
163                 } catch (NoSuchElementException e) {
164                     // The exception occurs when no prior runtime restarts had been recorded. It is
165                     // thrown from Iterables.getLast().
166                     addStatsdStatsBasedMetrics(
167                             currentRunMetrics, updatedTimestamps, device.getSerialNumber());
168                 }
169             }
170         }
171 
172         // Pull the AppCrashOccurred event metric reports and report the system server crashes
173         // within.
174         for (ITestDevice device : mTestDevices) {
175             if (!mDeviceConfigIds.containsKey(device.getSerialNumber())) {
176                 CLog.e("No config ID is associated with device %s.", device.getSerialNumber());
177                 continue;
178             }
179             long configId = mDeviceConfigIds.get(device.getSerialNumber());
180             List<Long> uptimeListNanos = new ArrayList<>();
181             try {
182                 List<EventMetricData> metricData = getEventMetricData(device, configId);
183                 uptimeListNanos.addAll(
184                         metricData
185                                 .stream()
186                                 .filter(d -> d.hasElapsedTimestampNanos())
187                                 .filter(d -> d.hasAtom())
188                                 .filter(d -> d.getAtom().hasAppCrashOccurred())
189                                 .filter(d -> d.getAtom().getAppCrashOccurred().hasProcessName())
190                                 .filter(
191                                         d ->
192                                                 SYSTEM_SERVER_KEYWORD.equals(
193                                                         d.getAtom()
194                                                                 .getAppCrashOccurred()
195                                                                 .getProcessName()))
196                                 .map(d -> d.getElapsedTimestampNanos())
197                                 .collect(Collectors.toList()));
198             } catch (DeviceNotAvailableException e) {
199                 // Error is not thrown as we still want to process other devices.
200                 CLog.e(
201                         "Failed to retrieve event metric data from device %s. Exception: %s.",
202                         device.getSerialNumber(), e);
203             }
204             addAtomBasedMetrics(currentRunMetrics, uptimeListNanos, device.getSerialNumber());
205             try {
206                 removeConfig(device, configId);
207             } catch (DeviceNotAvailableException e) {
208                 // Error is not thrown as we still want to remove the config from other devices.
209                 CLog.e(
210                         "Failed to remove statsd config from device %s. Exception: %s.",
211                         device.getSerialNumber(), e);
212             }
213         }
214     }
215 
216     /** Helper method to add metrics from StatsdStatsReport according to timestamps. */
addStatsdStatsBasedMetrics( final Map<String, Metric> metrics, List<Integer> timestampsSecs, String serial)217     private void addStatsdStatsBasedMetrics(
218             final Map<String, Metric> metrics, List<Integer> timestampsSecs, String serial) {
219         // Always add a count of system server crashes, regardless of whether there are any.
220         // The statsd metadata-based count is used as the atom-based data can miss runtime restart
221         // instances.
222         // TODO(b/135770315): Re-assess this after the root cause for missing instances in the atom
223         // -based results is found.
224         String countMetricKey = createMetricKey(METRIC_SUFFIX_COUNT, serial);
225         metrics.put(countMetricKey, stringToMetric(String.valueOf(timestampsSecs.size())));
226         // If there are runtime restarts, add a comma-separated list of timestamps.
227         if (!timestampsSecs.isEmpty()) {
228             // Store both the raw timestamp and the formatted, more readable version.
229 
230             String timestampMetricKey =
231                     createMetricKey(METRIC_SUFFIX_SYSTEM_TIMESTAMP_SECS, serial);
232             metrics.put(
233                     timestampMetricKey,
234                     stringToMetric(
235                             timestampsSecs
236                                     .stream()
237                                     .map(t -> String.valueOf(t))
238                                     .collect(Collectors.joining(","))));
239 
240             String formattedTimestampMetricKey =
241                     createMetricKey(METRIC_SUFFIX_SYSTEM_TIMESTAMP_FORMATTED, serial);
242             metrics.put(
243                     formattedTimestampMetricKey,
244                     stringToMetric(
245                             timestampsSecs
246                                     .stream()
247                                     .map(t -> timestampToHoursMinutesSeconds(t))
248                                     .collect(Collectors.joining(","))));
249         }
250     }
251 
252     /** Helper method to add metrics from the AppCrashOccurred atoms according to timestamps. */
addAtomBasedMetrics( final Map<String, Metric> metrics, List<Long> timestampsNanos, String serial)253     private void addAtomBasedMetrics(
254             final Map<String, Metric> metrics, List<Long> timestampsNanos, String serial) {
255         // If there are runtime restarts, add a comma-separated list of device uptime timestamps.
256         if (!timestampsNanos.isEmpty()) {
257             // Store both the raw timestamp and the formatted, more readable version.
258 
259             String uptimeNanosMetricKey = createMetricKey(METRIC_SUFFIX_UPTIME_NANOS, serial);
260             metrics.put(
261                     uptimeNanosMetricKey,
262                     stringToMetric(
263                             timestampsNanos
264                                     .stream()
265                                     .map(t -> String.valueOf(t))
266                                     .collect(Collectors.joining(","))));
267 
268             String formattedUptimeMetricKey =
269                     createMetricKey(METRIC_SUFFIX_UPTIME_FORMATTED, serial);
270             metrics.put(
271                     formattedUptimeMetricKey,
272                     stringToMetric(
273                             timestampsNanos
274                                     .stream()
275                                     .map(t -> nanosToHoursMinutesSeconds(t))
276                                     .collect(Collectors.joining(","))));
277         }
278     }
279 
createMetricKey(String suffix, String serial)280     private String createMetricKey(String suffix, String serial) {
281         return mIncludeDeviceSerial
282                 ? String.join(METRIC_SEP, METRIC_PREFIX, serial, suffix)
283                 : String.join(METRIC_SEP, METRIC_PREFIX, suffix);
284     }
285 
286     /**
287      * Forwarding logic to {@link ConfigUtil} for testing as it is a static utility class.
288      *
289      * @hide
290      */
291     @VisibleForTesting
pushStatsConfig(ITestDevice device, List<Integer> eventAtomIds)292     protected long pushStatsConfig(ITestDevice device, List<Integer> eventAtomIds)
293             throws IOException, DeviceNotAvailableException {
294         return ConfigUtil.pushStatsConfig(device, eventAtomIds);
295     }
296 
297     /**
298      * Forwarding logic to {@link ConfigUtil} for testing as it is a static utility class.
299      *
300      * @hide
301      */
302     @VisibleForTesting
removeConfig(ITestDevice device, long configId)303     protected void removeConfig(ITestDevice device, long configId)
304             throws DeviceNotAvailableException {
305         ConfigUtil.removeConfig(device, configId);
306     }
307 
308     /**
309      * Forwarding logic to {@link MetricUtil} for testing as it is a static utility class.
310      *
311      * @hide
312      */
313     @VisibleForTesting
getStatsdMetadata(ITestDevice device)314     protected StatsdStatsReport getStatsdMetadata(ITestDevice device)
315             throws DeviceNotAvailableException, InvalidProtocolBufferException {
316         return MetricUtil.getStatsdMetadata(device);
317     }
318 
319     /**
320      * Forwarding logic to {@link MetricUtil} for testing as it is a static utility class.
321      *
322      * @hide
323      */
324     @VisibleForTesting
getEventMetricData(ITestDevice device, long configId)325     protected List<EventMetricData> getEventMetricData(ITestDevice device, long configId)
326             throws DeviceNotAvailableException {
327         return MetricUtil.getEventMetricData(device, configId);
328     }
329 
330     /**
331      * Helper method to convert nanoseconds to HH:mm:ss. {@link SimpleDateFormat} is not used here
332      * as the input is not a real epoch-based timestamp.
333      */
nanosToHoursMinutesSeconds(long nanos)334     private String nanosToHoursMinutesSeconds(long nanos) {
335         long hours = TimeUnit.NANOSECONDS.toHours(nanos);
336         nanos -= TimeUnit.HOURS.toNanos(hours);
337         long minutes = TimeUnit.NANOSECONDS.toMinutes(nanos);
338         nanos -= TimeUnit.MINUTES.toNanos(minutes);
339         long seconds = TimeUnit.NANOSECONDS.toSeconds(nanos);
340         return String.format("%02d:%02d:%02d", hours, minutes, seconds);
341     }
342 
343     /** Helper method to format an epoch-based timestamp in seconds to HH:mm:ss. */
timestampToHoursMinutesSeconds(int seconds)344     private String timestampToHoursMinutesSeconds(int seconds) {
345         return TIME_FORMATTER.format(new Date(TimeUnit.SECONDS.toMillis(seconds)));
346     }
347 
348     /** Helper method to get all values in a list that appear after all occurrences of target. */
getAllValuesAfter(int target, List<Integer> l)349     private List<Integer> getAllValuesAfter(int target, List<Integer> l) {
350         return l.subList(l.lastIndexOf(target) + 1, l.size());
351     }
352 }
353