?? RIGHT := 110 ??
?? NEWTITLE := 'NOS/VE OS : 180 Monitor DUE processor.' ??
MODULE mtm$process_due_errors;

{ PURPOSE:
{   The purpose of this module is to process the DETECTED UNCORRECTED ERRORS (DUE) triggered by bit 48 in the
{   MCR.  The DUE is logged in a circular CM buffer.  The action then taken is reflected in Figure 9.1-1 of
{   the MIGDS and 180 Operating System action section 4.2.9 of the DFT/OS Interface specification. If certain
{   conditions are met, the CPU which has encountered the DUE will be deconfigured without interrupting the
{   system.

?? NEWTITLE := 'Global Declarations Referenced by This Module', EJECT ??
?? PUSH (LISTEXT := ON) ??
*copyc cml$system_informative_message
*copyc mtt$due_state
*copyc mtt$due_log
*copyc mtt$170_due_info
*copyc ost$cpu_state_table
*copyc ost$exchange_package
*copyc ost$informative_message_record
*copyc ost$processor_id
*copyc ost$processor_id_set
*copyc syt$monitor_request_code
*copyc syt$monitor_status
*copyc tmt$mcr_faults
?? POP ??
*copyc dpp$display_error
*copyc dsp$report_system_message
*copyc mtp$clear_lock
*copyc mtp$error_stop
*copyc mtp$manage_processor_with_due
*copyc mtp$set_lock
*copyc mtp$step_unstep_system
*copyc tmp$cause_task_switch
*copyc tmp$send_monitor_fault
?? EJECT ??
*copyc mtv$scb
*copyc mtv$sys_core_init_complete
*copyc mtv$total_nos_cpu_time
*copyc syv$enable_fault_injection
?? OLDTITLE ??
?? NEWTITLE := 'Global Declarations Declared by This Module', EJECT ??
  VAR
    { Externally defined flag.  mtp$process_due ignores a monitor-mode DUE (after logging it) when this
    { equals 1.
    mtv$no_stop: [XREF] integer,
    { Per-processor limit on aborted tasks.  When a processor's aborted_task_count exceeds this value,
    { mtp$process_due requests processor deconfiguration instead of aborting the task itself.
    mtv$aborted_task_threshold: [XDCL, #GATE] integer := 20,
    { Countdown of DUEs tolerated before mtv$sys_core_init_complete is set.  mtp$process_due decrements
    { it on each early DUE and steps the system when it reaches zero.
    mtv$deadstart_due_threshold: [XDCL] integer := 10,
    { Circular DUE log.  log_due serializes access through its lock field and advances next_i.
    mtv$due_log: [XDCL] mtt$due_log := [ 0, 'DUE_LOG', 0, [[0, 0], [0, 0], [0, 0]], * ],
    { When TRUE, mtp$process_due steps the system at a software breakpoint on every DUE.
    mtv$halt_on_proc_malf: [XDCL, #GATE] boolean := FALSE,
    { Exported here but not referenced by any procedure in this module.
    mtv$170_due_info: [XDCL] mtt$170_due_info := [0, 0, 0, 0],
    { Per-processor due_count limit.  Exceeding it in monitor mode requests deconfiguration; exceeding
    { it in job mode deselects the processor for the task (or aborts the task).
    mtv$processor_due_threshold: [XDCL, #GATE] integer := 10,
    { Free running clock value that starts the current monitor-mode DUE counting window.
    mtv$time_first_mtr_due: [XDCL] integer := 0;
?? OLDTITLE ??
?? NEWTITLE := 'mtp$process_due', EJECT ??
*copyc mth$process_due
  PROCEDURE [XDCL] mtp$process_due
    (    due_state: mtt$due_state;
         cst_p: ^ost$cpu_state_table;
         xp_p: ^ost$exchange_package);

{ PURPOSE:
{   Log a detected uncorrected error (DUE) and decide what to do about it: ignore it, abort the
{   executing task, deselect the processor for the task, or have the processor deconfigured.
{
{   DUE_STATE: (INPUT) State of the system (170, 180 monitor, 180 job) in which the DUE occurred.
{   CST_P:     (INPUT) Pointer to the CPU state table of the processor which took the DUE.
{   XP_P:      (INPUT) Pointer to the exchange package saved for the DUE.

    CONST
      ten_seconds = 10000000;

    VAR
      abort_task: boolean,
      conditions_warrant_cpu_deconfig: boolean,
      current_time: integer,
      process_damaged: boolean,
      processor_selections: ost$processor_id_set,
      processor_set: ost$processor_id_set,
      xcb_p: ^ost$execution_control_block;

{ Halt if the system attribute Halt_On_Processor_Malf is set.

    dpp$display_error ('DUE Encountered.');
    IF mtv$halt_on_proc_malf THEN
      mtp$step_unstep_system (syc$ic_software_breakpoint, 'VEOS9302- DUE: Halt_On_Proc_Malf set');
    IFEND;

{ Log the DUE.  LOG_DUE also reports whether the interrupted process was damaged.

    log_due (due_state, cst_p, xp_p, process_damaged);

{ Special handling for DUE errors that occur very early in deadstart.
{ Allow a small number of PND DUEs. Note - rest of this proc has to be skipped
{ since tables are not sufficiently initialized to process the failure.
{ The test is <= 0 (not = 0) so that the error stays fatal if the countdown has already passed zero,
{ e.g. when processing continues after the system has been stepped once.

    IF NOT mtv$sys_core_init_complete THEN
      mtv$deadstart_due_threshold := mtv$deadstart_due_threshold - 1;
      IF (mtv$deadstart_due_threshold <= 0) OR process_damaged THEN
        mtp$step_unstep_system (syc$ic_fatal_hardware_error, 'VEOS520E- FATAL CPU ERROR');
      IFEND;
      RETURN; {<---}
    IFEND;

    abort_task := FALSE;
    conditions_warrant_cpu_deconfig := FALSE;

{ Check for fatal errors: e.g. (DUE and process_damaged) in 180 job or 180 monitor.
{ Also, check for errors which warrant deconfiguration of a CPU out of the system configuration.

    CASE due_state OF
    = mtc$due_in_nos =
      mtp$error_stop ('We should have no 170 DUEs , MTP$PROCESS_DUE');

    = mtc$due_in_180_monitor =
      IF mtv$no_stop = 1 THEN
        dpp$display_error ('Monitor DUE in Idle Mode ignored');
        RETURN; {<---}
      IFEND;

      IF process_damaged THEN

{ A damaging monitor DUE always takes the processor out of the configuration.

        cst_p^.due_count := cst_p^.due_count + 1;
        conditions_warrant_cpu_deconfig := TRUE;
      ELSE

{ Accept non-clustered DUE errors.  The counting window is 10*60*ten_seconds clock units (100 minutes
{ if the free running clock counts microseconds, as the constant name implies - confirm).  Each DUE is
{ counted exactly once; the clock is read once so the comparison and the new window start agree.
{ NOTE(review): the window start is a single global while due_count is kept per processor.

        dpp$display_error ('Non-fatal Monitor DUE encountered');
        current_time := #free_running_clock (0);
        IF (current_time - mtv$time_first_mtr_due) > 10*60*ten_seconds THEN

{ The previous window has expired: start a new one with this DUE as its first error.

          mtv$time_first_mtr_due := current_time;
          cst_p^.due_count := 1;
          dpp$display_error ('  DUE Threshold not reached, count reset.');
        ELSE
          cst_p^.due_count := cst_p^.due_count + 1;

{ Has the processor exceeded the threshold for DUEs within the window?  The count is not reset here.

          IF cst_p^.due_count > mtv$processor_due_threshold THEN
            conditions_warrant_cpu_deconfig := TRUE;
          IFEND;
        IFEND;
      IFEND;

    = mtc$due_in_180_job =

      xcb_p := cst_p^.xcb_p;
      IF process_damaged THEN
        abort_task := TRUE;
        dpp$display_error ('Job Mode DUE, abort task.');
      ELSE

{ Count the malfunction against the task, saturating at the largest value the field can hold.

        IF xcb_p^.proc_malf_count < UPPERVALUE(xcb_p^.proc_malf_count) THEN
          xcb_p^.proc_malf_count := xcb_p^.proc_malf_count + 1;
        IFEND;

{ Has this task accumulated more than 5*10*ten_seconds of job mode time since its last DUE?

        IF xcb_p^.cp_time.time_spent_in_job_mode > (xcb_p^.time_last_due + 5*10*ten_seconds) THEN
          cst_p^.due_count := 1;
          dpp$display_error ('  DUE Threshold not reached, count reset.');
        ELSE
          cst_p^.due_count := cst_p^.due_count + 1;

{ Has the processor exceeded the threshold for DUEs within the window?

          IF cst_p^.due_count > mtv$processor_due_threshold THEN
            cst_p^.due_count := 1;

{ Deselect this CPU for this task.

            processor_set := xcb_p^.processor_selections;
            processor_set := processor_set - $ost$processor_id_set [xcb_p^.last_lpid_for_task];

{ Check for remaining processor availability.

            IF cst_p^.ijle_p^.job_scheduler_data.job_class = jmc$maintenance_job_class THEN
              processor_selections := mtv$scb.cpus.logically_on;
            ELSE
              processor_selections := mtv$scb.cpus.available_for_use;
            IFEND;
            IF (processor_set * processor_selections) = $ost$processor_id_set [] THEN

{ No other usable processor remains for this task, so it has to be aborted.

              abort_task := TRUE;
            ELSE
              xcb_p^.processor_selections := processor_set;
            IFEND;
          IFEND;
        IFEND;
        xcb_p^.time_last_due := xcb_p^.cp_time.time_spent_in_job_mode;

{ Give DFT time to catch the DUE before this task can possibly run again.

        tmp$cause_task_switch;

      IFEND;
    ELSE
      mtp$error_stop ('Illegal selection for DUE_STATE, MTP$PROCESS_DUE');
    CASEND;

{ Check for task abort.  If a task need not be aborted, but a processor must be deconfigured, the task will
{ be resurrected during deconfiguration processing, if it is possible to do so.

    IF abort_task AND (due_state = mtc$due_in_180_job) THEN
      log_aborted_task_message (cst_p, xcb_p);
      cst_p^.aborted_task_count := cst_p^.aborted_task_count + 1;

      IF cst_p^.aborted_task_count > mtv$aborted_task_threshold THEN

{ If the processor is over the threshold for aborted tasks, the processor will be turned off, or the system
{ will be stepped.  The task will be aborted as a result of the processor deconfiguration.

        conditions_warrant_cpu_deconfig := TRUE;
      ELSE

{ If the processor is not over the threshold for aborted tasks, abort the task.

        mtp$abort_task_with_due (cst_p, xcb_p);
      IFEND;

    IFEND;

    IF conditions_warrant_cpu_deconfig THEN
      mtp$manage_processor_with_due (xp_p^.last_processor_id);
    IFEND;

  PROCEND mtp$process_due;
?? OLDTITLE ??
?? NEWTITLE := 'mtp$abort_task_with_due', EJECT ??

{ PURPOSE:
{   The purpose of this procedure is to send the task with a DUE a monitor fault.  The task will abort.
{
{  MTP$ABORT_TASK_WITH_DUE (CST_P, XCB_P)
{
{  CST_P: (INPUT)  Pointer to the CPU state table.
{  XCB_P: (INPUT)  Pointer to the execution control block of the task which is to be aborted.

  PROCEDURE [XDCL] mtp$abort_task_with_due
    (    cst_p: ^ost$cpu_state_table;
         xcb_p: ^ost$execution_control_block);

{ Sends an MCR monitor fault carrying the detected-uncorrected-error condition to the task identified
{ by CST_P^.TASKID, then displays two lines naming the task and its job.

    VAR
      fault: ost$monitor_fault,
      fault_body_p: ^tmt$mcr_faults,
      no_pva: [READ] ost$pva := [1, 0fff(16), 7fffffff(16)],
      text: string (70);

{ Build the fault: the contents area is overlaid with the MCR fault layout.

    fault.identifier := tmc$mcr_fault;
    fault_body_p := #LOC (fault.contents);
    fault_body_p^.untranslatable_pointer := no_pva;
    fault_body_p^.faults := $ost$monitor_conditions [osc$detected_uncorrected_err];
    tmp$send_monitor_fault (cst_p^.taskid, ^fault, TRUE);

{ First display line: fixed text in columns 1-50, blank filled to 70.

    text := ' ';
    text (1,50) := 'Uncorrected CPU Error (DUE) occured; task aborted:';
    dpp$display_error (text);

{ Second display line: task name in columns 8-38, job system name in columns 50-68.

    text := ' ';
    text (50,19) := cst_p^.jcb_p^.system_name;
    text (39,11) := ', in job = ';
    text (8,31) := xcb_p^.save9;
    text (1,7) := 'Task = ';
    dpp$display_error (text);

  PROCEND mtp$abort_task_with_due;
?? OLDTITLE ??
?? NEWTITLE := 'log_due', EJECT ??

{ PURPOSE:
{   The purpose of this procedure is to log the DUE processor errors in a circular error log.
{
{  LOG_DUE (DUE_STATE, CST_P, XP_P, PROCESS_DAMAGED)
{
{  DUE_STATE:       (INPUT)  This parameter specifies the state of the system
{                            in which the DUE occurred.
{  CST_P:           (INPUT)  Pointer to the CPU state table.
{  XP_P:            (INPUT)  Pointer to the exchange package.
{  PROCESS_DAMAGED: (OUTPUT) Boolean indicating whether the executing task has
{                            been damaged by the DUE.

  PROCEDURE log_due
    (    due_state: mtt$due_state;
         cst_p: ^ost$cpu_state_table;
         xp_p: ^ost$exchange_package;
     VAR process_damaged: boolean);

{ Derives PROCESS_DAMAGED from the exchange package flags and, if the DUE log lock can be obtained,
{ records the DUE in the next slot of the circular log MTV$DUE_LOG.

    CONST
      lock_wait_limit = 3000000;

    VAR
      entry_time: ost$free_running_clock,
      give_up_time: ost$free_running_clock,
      got_lock: boolean,
      slot: mtt$due_log_entries;

{ The process is considered damaged unless the exchange package says otherwise.

    process_damaged := NOT (osc$process_not_damaged IN xp_p^.flags);

{ Spin on the log lock until it is obtained, the wait limit has elapsed, or this processor is no longer
{ scheduled to stay on.

    entry_time := #free_running_clock (0);
    give_up_time := entry_time + lock_wait_limit;
    REPEAT
      mtp$set_lock (mtv$due_log.lock, got_lock);
    UNTIL got_lock OR (#free_running_clock (0) > give_up_time) OR (cst_p^.next_processor_state <> cmc$on);

{ Without the lock nothing is logged; the DUE that holds the lock is left to finish its processing.

    IF got_lock THEN

{ Advance to the next slot, wrapping from the last entry back to the first.

      slot := (mtv$due_log.next_i MOD mtc$due_log_entry_count) + 1;
      mtv$due_log.next_i := slot;

{ Fill in the entry.  The recorded time is the clock value read on entry, before waiting for the lock.

      mtv$due_log.dues [slot].time := entry_time;
      mtv$due_log.dues [slot].task_id := cst_p^.taskid;
      mtv$due_log.dues [slot].due_state := due_state;
      mtv$due_log.dues [slot].process_damaged := process_damaged;
      mtv$due_log.dues [slot].xp := xp_p^;

{ Bump the running total kept per system state and damage indication.

      mtv$due_log.total_due_count [due_state] [process_damaged] :=
            mtv$due_log.total_due_count [due_state] [process_damaged] + 1;

      mtp$clear_lock (mtv$due_log.lock);
    IFEND;

  PROCEND log_due;
?? OLDTITLE ??
?? NEWTITLE := 'log_aborted_task_message', EJECT ??

{ PURPOSE:
{   This procedure puts an informative message (CM1800) in the engineering log which details the termination
{   of a task because of a DUE.
{
{        LOG_ABORTED_TASK_MESSAGE
{          (    CST_P: ^OST$CPU_STATE_TABLE;
{               XCB_P: ^OST$EXECUTION_CONTROL_BLOCK);
{
{        CST_P: (INPUT) Specifies the pointer to the cpu state table of the processor with the DUE
{        XCB_P: (INPUT) Specifies the pointer to the execution control block of the task which was executing
{                       when the DUE occurred

  PROCEDURE log_aborted_task_message
    (    cst_p: ^ost$cpu_state_table;
         xcb_p: ^ost$execution_control_block);

    VAR
      log_record: ost$informative_message_record,
      was_logged: boolean;

{ The message text is laid out in fixed columns on a blank background:
{ '$MMMM_SSSS_CCC_NNNN JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT DUE '
{   Chars 1-19:  system supplied job name
{   Chars 21-51: user supplied job name
{   Chars 53-83: task name
{   Chars 85-87: 'DUE'
{   Chars 20, 52 and 84 stay blank as separators.

    log_record.message := ' ';
    log_record.message (85, 3) := 'DUE';
    log_record.message (53, 31) := xcb_p^.save9;
    log_record.message (21, 31) := cst_p^.jcb_p^.jobname;
    log_record.message (1, 19) := cst_p^.jcb_p^.system_name;
    log_record.message_type := cml$system_informative_message;

{ Hand the record to the system message reporter.  The logged indication it returns is not used.

    dsp$report_system_message (#SEQ (log_record), dsc$general_du_error, dsc$informative_message, was_logged);

  PROCEND log_aborted_task_message;
?? OLDTITLE ??
MODEND mtm$process_due_errors;
